On Mon, Jun 23, 2025 at 3:36 PM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Thu, Jun 19, 2025 at 10:25 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > Extend the remove_redundant_vector pass to handle vector broadcasts from
> > constant and variable scalars.  When broadcasting from constants and
> > function arguments, we can place a single widest vector broadcast at
> > entry of the nearest common dominator for basic blocks with all uses
> > since constants and function arguments aren't changed.  For broadcast
> > from variables with a single definition, the single definition is
> > replaced with the widest broadcast.
> >
> > gcc/
> >
> >         PR target/92080
> >         * config/i386/i386-expand.cc (ix86_expand_call): Set
> >         recursive_function to true for recursive call.
> >         * config/i386/i386-features.cc (ix86_place_single_vector_set):
> >         Add an argument for inner scalar, default to nullptr.  Set the
> >         source from inner scalar if not nullptr.
> >         (ix86_get_vector_load_mode): Renamed to ...
> >         (ix86_get_vector_cse_mode): This.  Add an argument for scalar mode
> >         and handle integer and float scalar modes.
> >         (replace_vector_const): Add an argument for scalar mode and pass
> >         it to ix86_get_vector_cse_mode.
> >         (x86_cse_kind): New.
> >         (redundant_load): Likewise.
> >         (ix86_broadcast_inner): Likewise.
> >         (remove_redundant_vector_load): Also support const0_rtx and
> >         constm1_rtx broadcasts.  Handle vector broadcasts from constant
> >         and variable scalars.
> >         * config/i386/i386.h (machine_function): Add recursive_function.
> >
> > gcc/testsuite/
> >
> >         * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect
> >         movdqa instead of pxor.
> >         * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise.
> >         * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise.
> >         * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise.
> >         * gcc.target/i386/pr92080-4.c: New test.
> >         * gcc.target/i386/pr92080-5.c: Likewise.
> >         * gcc.target/i386/pr92080-6.c: Likewise.
> >         * gcc.target/i386/pr92080-7.c: Likewise.
> >         * gcc.target/i386/pr92080-8.c: Likewise.
> >         * gcc.target/i386/pr92080-9.c: Likewise.
> >         * gcc.target/i386/pr92080-10.c: Likewise.
> >         * gcc.target/i386/pr92080-11.c: Likewise.
> >         * gcc.target/i386/pr92080-12.c: Likewise.
> >         * gcc.target/i386/pr92080-13.c: Likewise.
> >         * gcc.target/i386/pr92080-14.c: Likewise.
> >         * gcc.target/i386/pr92080-15.c: Likewise.
> >         * gcc.target/i386/pr92080-16.c: Likewise.
> >
> > Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> > ---
> >  gcc/config/i386/i386-expand.cc                |   3 +
> >  gcc/config/i386/i386-features.cc              | 410 ++++++++++++++----
> >  gcc/config/i386/i386.h                        |   3 +
> >  .../i386/keylocker-aesdecwide128kl.c          |  14 +-
> >  .../i386/keylocker-aesdecwide256kl.c          |  14 +-
> >  .../i386/keylocker-aesencwide128kl.c          |  14 +-
> >  .../i386/keylocker-aesencwide256kl.c          |  14 +-
> >  gcc/testsuite/gcc.target/i386/pr92080-10.c    |  13 +
> >  gcc/testsuite/gcc.target/i386/pr92080-11.c    |  33 ++
> >  gcc/testsuite/gcc.target/i386/pr92080-12.c    |  16 +
> >  gcc/testsuite/gcc.target/i386/pr92080-13.c    |  32 ++
> >  gcc/testsuite/gcc.target/i386/pr92080-14.c    |  31 ++
> >  gcc/testsuite/gcc.target/i386/pr92080-15.c    |  25 ++
> >  gcc/testsuite/gcc.target/i386/pr92080-16.c    |  26 ++
> >  gcc/testsuite/gcc.target/i386/pr92080-4.c     |  50 +++
> >  gcc/testsuite/gcc.target/i386/pr92080-5.c     | 109 +++++
> >  gcc/testsuite/gcc.target/i386/pr92080-6.c     |  19 +
> >  gcc/testsuite/gcc.target/i386/pr92080-7.c     |  20 +
> >  gcc/testsuite/gcc.target/i386/pr92080-8.c     |  16 +
> >  gcc/testsuite/gcc.target/i386/pr92080-9.c     |  81 ++++
> >  20 files changed, 823 insertions(+), 120 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-15.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> > index 4946f87a131..60c1ac5d2b1 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -10154,6 +10154,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx 
> > callarg1,
> >           else if (lookup_attribute ("no_callee_saved_registers",
> >                                      TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
> >             call_no_callee_saved_registers = true;
> > +         if (fndecl == current_function_decl
> > +             && decl_binds_to_current_def_p (fndecl))
> > +           cfun->machine->recursive_function = true;
> >         }
> >      }
> >    else
> > diff --git a/gcc/config/i386/i386-features.cc 
> > b/gcc/config/i386/i386-features.cc
> > index 56ab7f2d23b..2c49d11da8f 100644
> > --- a/gcc/config/i386/i386-features.cc
> > +++ b/gcc/config/i386/i386-features.cc
> > @@ -3088,10 +3088,12 @@ ix86_rpad_gate ()
> >  /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
> >     for basic block map BBS, which is in the fake loop that contains the
> >     whole function, so that there is only a single vector set in the
> > -   whole function.   */
> > +   whole function.  If not nullptr, INNER_SCALAR is the inner scalar of
> > +   SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)).  */
> >
> >  static void
> > -ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
> > +                             rtx inner_scalar = nullptr)
> >  {
> >    basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
> >    while (bb->loop_father->latch
> > @@ -3112,10 +3114,23 @@ ix86_place_single_vector_set (rtx dest, rtx src, 
> > bitmap bbs)
> >        insn = NEXT_INSN (insn);
> >      }
> >
> > +  rtx_insn *set_insn;
> >    if (insn == BB_HEAD (bb))
> > -    emit_insn_before (set, insn);
> > +    set_insn = emit_insn_before (set, insn);
> >    else
> > -    emit_insn_after (set, insn ? PREV_INSN (insn) : BB_END (bb));
> > +    set_insn = emit_insn_after (set,
> > +                               insn ? PREV_INSN (insn) : BB_END (bb));
> > +
> > +  if (inner_scalar)
> > +    {
> > +      /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
> > +      rtx reg = XEXP (src, 0);
> > +      if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
> > +         && GET_MODE (reg) != GET_MODE (inner_scalar))
> > +       inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
> > +      rtx set = gen_rtx_SET (reg, inner_scalar);
> > +      emit_insn_before (set, set_insn);
> > +    }
> >  }
> >
> >  /* At entry of the nearest common dominator for basic blocks with
> > @@ -3346,26 +3361,15 @@ make_pass_remove_partial_avx_dependency 
> > (gcc::context *ctxt)
> >    return new pass_remove_partial_avx_dependency (ctxt);
> >  }
> >
> > -/* Return a machine mode suitable for vector SIZE.  */
> > +/* Return a machine mode suitable for vector SIZE with SMODE inner
> > +   mode.  */
> >
> >  static machine_mode
> > -ix86_get_vector_load_mode (unsigned int size)
> > +ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
> >  {
> > -  machine_mode mode;
> > -  if (size == 64)
> > -    mode = V64QImode;
> > -  else if (size == 32)
> > -    mode = V32QImode;
> > -  else if (size == 16)
> > -    mode = V16QImode;
> > -  else if (size == 8)
> > -    mode = V8QImode;
> > -  else if (size == 4)
> > -    mode = V4QImode;
> > -  else if (size == 2)
> > -    mode = V2QImode;
> > -  else
> > -    gcc_unreachable ();
> > +  scalar_mode s_mode = as_a <scalar_mode> (smode);
> > +  poly_uint64 nunits = size / GET_MODE_SIZE (smode);
> > +  machine_mode mode = mode_for_vector (s_mode, nunits).require ();
> >    return mode;
> >  }
> >
> > @@ -3374,7 +3378,8 @@ ix86_get_vector_load_mode (unsigned int size)
> >
> >  static void
> >  replace_vector_const (machine_mode vector_mode, rtx vector_const,
> > -                     auto_bitmap &vector_insns)
> > +                     auto_bitmap &vector_insns,
> > +                     machine_mode scalar_mode)
> >  {
> >    bitmap_iterator bi;
> >    unsigned int id;
> > @@ -3386,7 +3391,8 @@ replace_vector_const (machine_mode vector_mode, rtx 
> > vector_const,
> >        /* Get the single SET instruction.  */
> >        rtx set = single_set (insn);
> >        rtx src = SET_SRC (set);
> > -      machine_mode mode = GET_MODE (src);
> > +      rtx dest = SET_DEST (set);
> > +      machine_mode mode = GET_MODE (dest);
> >
> >        rtx replace;
> >        /* Replace the source operand with VECTOR_CONST.  */
> > @@ -3400,7 +3406,8 @@ replace_vector_const (machine_mode vector_mode, rtx 
> > vector_const,
> >               /* If the mode size is smaller than its natural size,
> >                  first insert an extra move with a QI vector SUBREG
> >                  of the same size to avoid validate_subreg failure.  */
> > -             machine_mode vmode = ix86_get_vector_load_mode (size);
> > +             machine_mode vmode
> > +               = ix86_get_vector_cse_mode (size, scalar_mode);
> >               rtx vreg;
> >               if (mode == vmode)
> >                 vreg = vector_const;
> > @@ -3426,6 +3433,169 @@ replace_vector_const (machine_mode vector_mode, rtx 
> > vector_const,
> >      }
> >  }
> >
> > +enum x86_cse_kind
> > +{
> > +  X86_CSE_CONST0_VECTOR,
> > +  X86_CSE_CONSTM1_VECTOR,
> > +  X86_CSE_VEC_DUP
> > +};
> > +
> > +struct redundant_load
> > +{
> > +  /* Bitmap of basic blocks with broadcast instructions.  */
> > +  auto_bitmap bbs;
> > +  /* Bitmap of broadcast instructions.  */
> > +  auto_bitmap insns;
> > +  /* The broadcast inner scalar.  */
> > +  rtx val;
> > +  /* The inner scalar mode.  */
> > +  machine_mode mode;
> > +  /* The instruction which sets the inner scalar.  Nullptr if the inner
> > +     scalar is applied to the whole function, instead of within the same
> > +     block.  */
> > +  rtx_insn *def_insn;
> > +  /* The widest broadcast source.  */
> > +  rtx broadcast_source;
> > +  /* The widest broadcast register.  */
> > +  rtx broadcast_reg;
> > +  /* The basic block of the broadcast instruction.  */
> > +  basic_block bb;
> > +  /* The number of broadcast instructions with the same inner scalar.  */
> > +  unsigned HOST_WIDE_INT count;
> > +  /* The threshold of broadcast instructions with the same inner
> > +     scalar.  */
> > +  unsigned int threshold;
> > +  /* The widest broadcast size in bytes.  */
> > +  unsigned int size;
> > +  /* Load kind.  */
> > +  x86_cse_kind kind;
> > +};
> > +
> > +/* Return the inner scalar if OP is a broadcast, else return nullptr.  */
> > +
> > +static rtx
> > +ix86_broadcast_inner (rtx op, machine_mode mode,
> > +                     machine_mode *scalar_mode_p,
> > +                     x86_cse_kind *kind_p, rtx_insn **insn_p)
> > +{
> > +  if (op == const0_rtx || op == CONST0_RTX (mode))
> > +    {
> > +      *scalar_mode_p = QImode;
> > +      *kind_p = X86_CSE_CONST0_VECTOR;
> > +      *insn_p = nullptr;
> > +      return const0_rtx;
> > +    }
> > +  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > +          && (op == constm1_rtx || op == CONSTM1_RTX (mode)))
> > +    {
> > +      *scalar_mode_p = QImode;
> > +      *kind_p = X86_CSE_CONSTM1_VECTOR;
> > +      *insn_p = nullptr;
> > +      return constm1_rtx;
> > +    }
> > +
> > +  mode = GET_MODE (op);
> > +  int nunits = GET_MODE_NUNITS (mode);
> > +  if (nunits < 2)
> > +    return nullptr;
> > +
> > +  *kind_p = X86_CSE_VEC_DUP;
> > +
> > +  rtx reg;
> > +  if (GET_CODE (op) == VEC_DUPLICATE)
> > +    {
> > +      /* Only
> > +         (vec_duplicate:V4SI (reg:SI 99))
> > +         (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0  S8 A64]))
> > +        are supported.  */
> > +      op = XEXP (op, 0);
> > +      reg = op;
> > +      if (SUBREG_P (op)
> > +         && SUBREG_BYTE (op) == 0
> > +         && !paradoxical_subreg_p (op))
> > +       reg = SUBREG_REG (op);
> > +      if (!REG_P (reg))
> > +       {
> > +         if (MEM_P (op)
> > +             && SYMBOL_REF_P (XEXP (op, 0))
> > +             && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
> > +           {
> > +             /* Handle constant broadcast from memory.  */
> > +             *scalar_mode_p = GET_MODE_INNER (mode);
> > +             *insn_p = nullptr;
> > +             return op;
> > +           }
> > +         return nullptr;
> > +       }
> > +    }
> > +  else if (CONST_VECTOR_P (op))
> > +    {
> > +      rtx first = XVECEXP (op, 0, 0);
> > +      for (int i = 1; i < nunits; ++i)
> > +       {
> > +         rtx tmp = XVECEXP (op, 0, i);
> > +         /* Vector duplicate value.  */
> > +         if (!rtx_equal_p (tmp, first))
> > +           return nullptr;
> > +       }
> > +      *scalar_mode_p = GET_MODE (first);
> > +      *insn_p = nullptr;
> > +      return first;
> > +    }
> > +  else
> > +    return nullptr;
> > +
> > +  mode = GET_MODE (op);
> > +
> > +  /* Only single def chain is supported.  */
> > +  df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
> > +  if (!ref
> > +      || DF_REF_IS_ARTIFICIAL (ref)
> > +      || DF_REF_NEXT_REG (ref) != nullptr)
> > +    return nullptr;
> > +
> > +  rtx_insn *insn = DF_REF_INSN (ref);
> > +  rtx set = single_set (insn);
> > +  if (!set)
> > +    return nullptr;
> > +
> > +  rtx dest = SET_DEST (set);
> > +
> > +  op = SET_SRC (set);
> > +  /* Set *INSN_P if the scalar source is neither a constant nor an
> > +     incoming argument.  */
> > +  if (CONST_INT_P (op) || CONST_DOUBLE_P (op))
> > +    *insn_p = nullptr;
> > +  else if (REG_P (op)
> > +          && REG_EXPR (op)
> > +          && TREE_CODE (REG_EXPR (op)) == PARM_DECL)
> > +    *insn_p = nullptr;
> > +  else if (MEM_P (op)
> > +          && MEM_EXPR (op)
> > +          && TREE_CODE (get_base_address (MEM_EXPR (op))) == PARM_DECL)
> > +    *insn_p = nullptr;
> > +  else
> > +    {
> > +      while (SUBREG_P (dest))
> > +       dest = SUBREG_REG (dest);
> > +
> > +      /* Skip if the SET destination mode doesn't match.  */
> > +      if (GET_MODE (dest) != mode)
> > +       return nullptr;
> > +
> > +      /* Set the inner scalar to the SET destination. */
> > +      op = dest;
> > +      *insn_p = insn;
> > +    }
> > +
> > +  *scalar_mode_p = mode;
> > +  if (CONSTANT_P (op))
> > +    *insn_p = nullptr;
> > +  else
> > +    *insn_p = insn;
> > +  return op;
> > +}
> > +
> >  /* At entry of the nearest common dominator for basic blocks with vector
> >     CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
> >     vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
> > @@ -3440,20 +3610,16 @@ remove_redundant_vector_load (void)
> >  {
> >    timevar_push (TV_MACH_DEP);
> >
> > -  auto_bitmap zero_bbs;
> > -  auto_bitmap m1_bbs;
> > -  auto_bitmap zero_insns;
> > -  auto_bitmap m1_insns;
> > -
> > +  auto_vec<redundant_load *> loads;
> > +  redundant_load *load;
> >    basic_block bb;
> >    rtx_insn *insn;
> > -  unsigned HOST_WIDE_INT zero_count = 0;
> > -  unsigned HOST_WIDE_INT m1_count = 0;
> > -  unsigned int zero_size = 0;
> > -  unsigned int m1_size = 0;
> > +  unsigned int i;
> >
> >    df_set_flags (DF_DEFER_INSN_RESCAN);
> >
> > +  bool recursive_call_p = cfun->machine->recursive_function;
> > +
> >    FOR_EACH_BB_FN (bb, cfun)
> >      {
> >        FOR_BB_INSNS (bb, insn)
> > @@ -3481,79 +3647,139 @@ remove_redundant_vector_load (void)
> >           if (!REG_P (dest) && !SUBREG_P (dest))
> >             continue;
> >
> > -         if (src == CONST0_RTX (mode))
> > -           {
> > -             /* Record vector instruction with CONST0_RTX.  */
> > -             bitmap_set_bit (zero_insns, INSN_UID (insn));
> > +         rtx_insn *def_insn;
> > +         machine_mode scalar_mode;
> > +         x86_cse_kind kind;
> > +         rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
> > +                                         &kind, &def_insn);
> > +         if (!val)
> > +           continue;
> >
> > -             /* Record the maximum vector size.  */
> > -             if (zero_size < GET_MODE_SIZE (mode))
> > -               zero_size = GET_MODE_SIZE (mode);
> > +          /* Remove redundant register loads if at least 2 loads
> > +             will be used.  */
> > +         unsigned int threshold = 2;
> > +
> > +         /* Check if there is a matching redundant vector load.   */
> > +         bool matched = false;
> > +         FOR_EACH_VEC_ELT (loads, i, load)
> > +           if (load->val
> > +               && load->kind == kind
> > +               && load->mode == scalar_mode
> > +               && (load->bb == bb
> > +                   || kind < X86_CSE_VEC_DUP
> > +                   /* Non all 0s/1s vector load must be in the same
> > +                      basic block if it is in a recursive call.  */
> > +                   || !recursive_call_p)
> > +               && rtx_equal_p (load->val, val))
> So I guess a cse_insn across a recursive call makes it difficult for RA
> to allocate with caller-saved registers, since they're in the same
> register and need to be saved and restored due to the ABI?
> It may be OK for different calls, since those cse_insns can be
> allocated to different registers and can be handled well by IPA-RA?

Since callee-saved registers can be used, the caller doesn't
need to save and restore them around the call.  When the caller
is also the callee (a recursive call), it becomes expensive.

> > +             {
> > +               /* Record vector instruction.  */
> > +               bitmap_set_bit (load->insns, INSN_UID (insn));
> >
> > -             /* Record the basic block with CONST0_RTX.  */
> > -             bitmap_set_bit (zero_bbs, bb->index);
> > -             zero_count++;
> > -           }
> > -         else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
> > -                  && src == CONSTM1_RTX (mode))
> > -           {
> > -             /* Record vector instruction with CONSTM1_RTX.  */
> > -             bitmap_set_bit (m1_insns, INSN_UID (insn));
> > +               /* Record the maximum vector size.  */
> > +               if (load->size < GET_MODE_SIZE (mode))
> > +                 load->size = GET_MODE_SIZE (mode);
> >
> > -             /* Record the maximum vector size.  */
> > -             if (m1_size < GET_MODE_SIZE (mode))
> > -               m1_size = GET_MODE_SIZE (mode);
> > +               /* Record the basic block.  */
> > +               bitmap_set_bit (load->bbs, bb->index);
> > +               load->count++;
> > +               matched = true;
> > +               break;
> > +             }
> >
> > -             /* Record the basic block with CONSTM1_RTX.  */
> > -             bitmap_set_bit (m1_bbs, bb->index);
> > -             m1_count++;
> > -           }
> > -       }
> > -    }
> > +         if (matched)
> > +           continue;
> >
> > -  if (zero_count > 1 || m1_count > 1)
> > -    {
> > -      machine_mode zero_mode, m1_mode;
> > -      rtx vector_const0, vector_constm1;
> > +         /* We see this vector broadcast the first time.  */
> > +         load = new redundant_load;
> >
> > -      if (zero_count > 1)
> > -       {
> > -         zero_mode = ix86_get_vector_load_mode (zero_size);
> > -         vector_const0 = gen_reg_rtx (zero_mode);
> > -         replace_vector_const (zero_mode, vector_const0, zero_insns);
> > -       }
> > -      else
> > -       {
> > -         zero_mode = VOIDmode;
> > -         vector_const0 = nullptr;
> > -       }
> > +         load->val = copy_rtx (val);
> > +         load->mode = scalar_mode;
> > +         load->size = GET_MODE_SIZE (mode);
> > +         load->def_insn = def_insn;
> > +         load->count = 1;
> > +         load->threshold = threshold;
> > +         load->bb = BLOCK_FOR_INSN (insn);
> > +         load->kind = kind;
> >
> > -      if (m1_count > 1)
> > -       {
> > -         m1_mode = ix86_get_vector_load_mode (m1_size);
> > -         vector_constm1 = gen_reg_rtx (m1_mode);
> > -         replace_vector_const (m1_mode, vector_constm1, m1_insns);
> > -       }
> > -      else
> > -       {
> > -         m1_mode = VOIDmode;
> > -         vector_constm1 = nullptr;
> > +         bitmap_set_bit (load->insns, INSN_UID (insn));
> > +         bitmap_set_bit (load->bbs, bb->index);
> > +
> > +         loads.safe_push (load);
> >         }
> > +    }
> >
> > +  bool replaced = false;
> > +  rtx reg, broadcast_source, broadcast_reg;
> > +  FOR_EACH_VEC_ELT (loads, i, load)
> > +    if (load->count >= load->threshold)
> > +      {
> > +       machine_mode mode = ix86_get_vector_cse_mode (load->size,
> > +                                                     load->mode);
> > +       broadcast_reg = gen_reg_rtx (mode);
> > +       if (load->def_insn)
> > +         {
> > +           /* Replace redundant vector loads with a single vector load
> > +              in the same basic block.  */
> > +           reg = load->val;
> > +           if (load->mode != GET_MODE (reg))
> > +             reg = gen_rtx_SUBREG (load->mode, reg, 0);
> > +           broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
> > +           replace_vector_const (mode, broadcast_reg, load->insns,
> > +                                 load->mode);
> > +         }
> > +       else
> > +         {
> > +           /* This is a constant integer/double vector.  If the
> > +              inner scalar is 0 or -1, set vector to CONST0_RTX
> > +              or CONSTM1_RTX directly.  */
> > +           rtx reg;
> > +           switch (load->kind)
> > +             {
> > +             case X86_CSE_CONST0_VECTOR:
> > +               broadcast_source = CONST0_RTX (mode);
> > +               break;
> > +             case X86_CSE_CONSTM1_VECTOR:
> > +               broadcast_source = CONSTM1_RTX (mode);
> > +               break;
> > +             default:
> > +               reg = gen_reg_rtx (load->mode);
> > +               broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
> > +               break;
> > +             }
> > +           replace_vector_const (mode, broadcast_reg, load->insns,
> > +                                 load->mode);
> > +         }
> > +       load->broadcast_source = broadcast_source;
> > +       load->broadcast_reg = broadcast_reg;
> > +       replaced = true;
> > +      }
> > +
> > +  if (replaced)
> > +    {
> >        /* (Re-)discover loops so that bb->loop_father can be used in the
> >          analysis below.  */
> >        calculate_dominance_info (CDI_DOMINATORS);
> >        loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
> >
> > -      if (vector_const0)
> > -       ix86_place_single_vector_set (vector_const0,
> > -                                     CONST0_RTX (zero_mode),
> > -                                     zero_bbs);
> > -
> > -      if (vector_constm1)
> > -       ix86_place_single_vector_set (vector_constm1,
> > -                                     CONSTM1_RTX (m1_mode),
> > -                                     m1_bbs);
> > +      FOR_EACH_VEC_ELT (loads, i, load)
> > +       if (load->count >= load->threshold)
> > +         {
> > +           if (load->def_insn)
> > +             {
> > +               /* Insert a broadcast after the original scalar
> > +                  definition.  */
> > +               rtx set = gen_rtx_SET (load->broadcast_reg,
> > +                                      load->broadcast_source);
> > +               insn = emit_insn_after (set, load->def_insn);
> > +             }
> > +           else
> > +             ix86_place_single_vector_set (load->broadcast_reg,
> > +                                           load->broadcast_source,
> > +                                           load->bbs,
> > +                                           (load->kind == X86_CSE_VEC_DUP
> > +                                            ? load->val
> > +                                            : nullptr));
> > +         }
> >
> >        loop_optimizer_finalize ();
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 7c16eac7700..812055085bb 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -2924,6 +2924,9 @@ struct GTY(()) machine_function {
> >    /* True if inline asm with redzone clobber has been seen.  */
> >    BOOL_BITFIELD asm_redzone_clobber_seen : 1;
> >
> > +  /* True if this is a recursive function.  */
> > +  BOOL_BITFIELD recursive_function : 1;
> > +
> >    /* The largest alignment, in bytes, of stack slot actually used.  */
> >    unsigned int max_used_stack_alignment;
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c 
> > b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
> > index 93806e51508..e73ba35ddd1 100644
> > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
> > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c
> > @@ -19,14 +19,14 @@
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
> >  /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c 
> > b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
> > index f9ccc82c7ca..33cd998bfdf 100644
> > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
> > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c
> > @@ -19,14 +19,14 @@
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
> >  /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c 
> > b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
> > index c0fcd28fb07..75106e59b77 100644
> > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
> > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c
> > @@ -19,14 +19,14 @@
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
> >  /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c 
> > b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
> > index 31463a8b2da..2787732229a 100644
> > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
> > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c
> > @@ -19,14 +19,14 @@
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */
> >  /* { dg-final { scan-assembler "(?:movdqu|movups)\[ 
> > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */
> > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */
> >  /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */
> > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */
> >
> >  #include <immintrin.h>
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-10.c b/gcc/testsuite/gcc.target/i386/pr92080-10.c
> > new file mode 100644
> > index 00000000000..b67f9d8d285
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-10.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=sapphirerapids -Ofast" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
> > +
> > +extern short write_picture_p_Vid_0;
> > +extern unsigned short *write_picture_p_2_0_0;
> > +extern int write_picture_p_0, write_picture_p_1, write_picture_i;
> > +void write_picture() {
> > +  unsigned short cr_val = 1 << write_picture_p_Vid_0;
> > +  for (; write_picture_p_1;)
> > +    for (; write_picture_i < write_picture_p_0; write_picture_i++)
> > +      write_picture_p_2_0_0[write_picture_i] = cr_val;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-11.c b/gcc/testsuite/gcc.target/i386/pr92080-11.c
> > new file mode 100644
> > index 00000000000..8747fc47640
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-11.c
> > @@ -0,0 +1,33 @@
> > +/* { dg-do run { target { avx512f_runtime } } } */
> > +/* { dg-options "-mavx512f -mtune=icelake-server -O3" } */
> > +
> > +struct s {
> > +  char s[sizeof(long double)];
> > +};
> > +
> > +union u {
> > +  long double d;
> > +  struct s s;
> > +};
> > +
> > +int main()
> > +{
> > +  union u x = {0};
> > +#if __SIZEOF_LONG_DOUBLE__ == 16
> > +  x.s = (struct s){"xxxxxxxxxxxxxxxx"};
> > +#elif __SIZEOF_LONG_DOUBLE__ == 12
> > +  x.s = (struct s){"xxxxxxxxxxxx"};
> > +#elif __SIZEOF_LONG_DOUBLE__ == 8
> > +  x.s = (struct s){"xxxxxxxx"};
> > +#elif __SIZEOF_LONG_DOUBLE__ == 4
> > +  x.s = (struct s){"xxxx"};
> > +#endif
> > +
> > +  union u y = x;
> > +
> > +  for (unsigned char *p = (unsigned char *)&y + sizeof y;
> > +       p-- > (unsigned char *)&y;)
> > +    if (*p != (unsigned char)'x')
> > +      __builtin_abort ();
> > +  return 0;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-12.c b/gcc/testsuite/gcc.target/i386/pr92080-12.c
> > new file mode 100644
> > index 00000000000..cb09eb2f0a8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-12.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -mno-mmx -march=icelake-server" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> > +
> > +signed char a;
> > +signed char f (int i, int j)
> > +{
> > +  signed char c;
> > +  while (i != 0)
> > +  {
> > +    a ^= j;
> > +    ++c;
> > +    ++i;
> > +  }
> > +  return c;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-13.c b/gcc/testsuite/gcc.target/i386/pr92080-13.c
> > new file mode 100644
> > index 00000000000..24b7616c894
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-13.c
> > @@ -0,0 +1,32 @@
> > +/* { dg-do run { target { avx512f_runtime } } } */
> > +/* { dg-options "-mavx512f -mtune=icelake-server -O2 -save-temps" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
> > +
> > +#include <assert.h>
> > +
> > +#define CONTAINER_KIND union
> > +
> > +typedef CONTAINER_KIND container { int value; } container;
> > +
> > +void move(container* end, container* start) {
> > +    container* p;
> > +    for (p = end; p > start; p--) {
> > +       (p)->value = (p-1)->value;
> > +    }
> > +}
> > +
> > +#define N 100
> > +
> > +int main(int argc, char* argv[]) {
> > +    container vals[N];
> > +    int i;
> > +    for (i=0; i<N; i++) {
> > +        vals[i].value = argc + i;
> > +    }
> > +    move(&vals[N-1], &vals[0]);
> > +    assert(vals[0].value == argc + 0);
> > +    for (i=1; i<N; i++) {
> > +        assert(vals[i].value == argc + i - 1);
> > +    }
> > +    return 0;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-14.c b/gcc/testsuite/gcc.target/i386/pr92080-14.c
> > new file mode 100644
> > index 00000000000..6be41b63400
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-14.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
> > +
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v4si __attribute__((vector_size(16)));
> > +
> > +extern v16si sinksz;
> > +extern v8si sinksy;
> > +extern v4si sinksx;
> > +extern v4si sinksx1;
> > +
> > +extern void bar (void);
> > +
> > +void
> > +foo (char c, int i)
> > +{
> > +  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  if (i == 1)
> > +    {
> > +      sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
> > +      bar ();
> > +    }
> > +  else if (i == 2)
> > +    {
> > +      sinksx = __extension__(v4si){c,c,c,c};
> > +      bar ();
> > +    }
> > +  sinksx1 = __extension__(v4si){c,c,c,c};
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-15.c b/gcc/testsuite/gcc.target/i386/pr92080-15.c
> > new file mode 100644
> > index 00000000000..fa55d82e48e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-15.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O1 -march=x86-64-v4" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
> > +
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +
> > +extern v4si *s1;
> > +extern v8si *s2;
> > +extern v16si *s3;
> > +
> > +int
> > +foo (int i, int j)
> > +{
> > +  if (j == 1)
> > +   s1[i] = __extension__(v4si){34, 34, 34, 34};
> > +  else if (i == 1)
> > +    s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
> > +  if ((i + j) == 1234)
> > +    i = foo (j, i);
> > +  s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
> > +                                  34, 34, 34, 34, 34, 34, 34, 34};
> > +  return i - j;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-16.c b/gcc/testsuite/gcc.target/i386/pr92080-16.c
> > new file mode 100644
> > index 00000000000..c8ab084b714
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-16.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O1 -march=x86-64-v4" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
> > +
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +
> > +extern v4si *s1;
> > +extern v8si *s2;
> > +extern v16si *s3;
> > +
> > +int
> > +foo (int i, int j)
> > +{
> > +  if (j == 1)
> > +    {
> > +      s1[i] = __extension__(v4si){34, 34, 34, 34};
> > +      s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
> > +      s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
> > +                                      34, 34, 34, 34, 34, 34, 34, 34};
> > +    }
> > +  if ((i + j) == 1234)
> > +    i = foo (j, i);
> > +  return i - j;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-4.c b/gcc/testsuite/gcc.target/i386/pr92080-4.c
> > new file mode 100644
> > index 00000000000..ebe1384c691
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-4.c
> > @@ -0,0 +1,50 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */
> > +
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v4si __attribute__((vector_size(16)));
> > +
> > +typedef short v32hi __attribute__((vector_size(64)));
> > +typedef short v16hi __attribute__((vector_size(32)));
> > +typedef short v8hi __attribute__((vector_size(16)));
> > +
> > +typedef char v64qi __attribute__((vector_size(64)));
> > +typedef char v32qi __attribute__((vector_size(32)));
> > +typedef char v16qi __attribute__((vector_size(16)));
> > +
> > +extern v16si sinksz;
> > +extern v8si sinksy;
> > +extern v4si sinksx;
> > +extern v32hi sinkhz;
> > +extern v16hi sinkhy;
> > +extern v8hi sinkhx;
> > +extern v64qi sinkbz;
> > +extern v32qi sinkby;
> > +extern v16qi sinkbx;
> > +
> > +void foo(char c) {
> > +  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
> > +  sinksx = __extension__(v4si){c,c,c,c};
> > +}
> > +
> > +void foo1(char c) {
> > +  sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
> > +    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
> > +}
> > +
> > +void foo2(char c) {
> > +  sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
> > +    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
> > +    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
> > +    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
> > +    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +  sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-5.c b/gcc/testsuite/gcc.target/i386/pr92080-5.c
> > new file mode 100644
> > index 00000000000..380cd337e09
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-5.c
> > @@ -0,0 +1,109 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=x86-64-v4" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
> > +/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vbroadcastss" 1 } } */
> > +
> > +typedef long long v2di __attribute__((vector_size(16)));
> > +typedef long long v4di __attribute__((vector_size(32)));
> > +typedef long long v8di __attribute__((vector_size(64)));
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef short v8hi __attribute__((vector_size(16)));
> > +typedef short v16hi __attribute__((vector_size(32)));
> > +typedef short v32hi __attribute__((vector_size(64)));
> > +typedef char v16qi __attribute__((vector_size(16)));
> > +typedef char v32qi __attribute__((vector_size(32)));
> > +typedef char v64qi __attribute__((vector_size(64)));
> > +typedef float v4sf __attribute__((vector_size(16)));
> > +typedef float v8sf __attribute__((vector_size(32)));
> > +typedef float v16sf __attribute__((vector_size(64)));
> > +typedef double v2df __attribute__((vector_size(16)));
> > +typedef double v4df __attribute__((vector_size(32)));
> > +typedef double v8df __attribute__((vector_size(64)));
> > +
> > +extern v16qi b1;
> > +extern v8hi h1;
> > +extern v4si s1;
> > +extern v2di l1;
> > +extern v4sf f1;
> > +extern v2df d1;
> > +extern v32qi b2;
> > +extern v16hi h2;
> > +extern v8si s2;
> > +extern v4di l2;
> > +extern v8sf f2;
> > +extern v4df d2;
> > +extern v64qi b3;
> > +extern v32hi h3;
> > +extern v16si s3;
> > +extern v8di l3;
> > +extern v16sf f3;
> > +extern v8df d3;
> > +
> > +void
> > +foo1 ()
> > +{
> > +  b1 = __extension__(v16qi){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +  b2 = __extension__(v32qi){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +  b3 = __extension__(v64qi){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > +
> > +void
> > +foo2 ()
> > +{
> > +  h1 = __extension__(v8hi){34, 34, 34, 34, 34, 34, 34, 34};
> > +  h2 = __extension__(v16hi){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +  h3 = __extension__(v32hi){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > +
> > +void
> > +foo3 ()
> > +{
> > +  s1 = __extension__(v4si){34, 34, 34, 34};
> > +  s2 = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34};
> > +  s3 = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > +
> > +void
> > +foo4 ()
> > +{
> > +  l1 = __extension__(v2di){34, 34};
> > +  l2 = __extension__(v4di){34, 34, 34, 34};
> > +  l3 = __extension__(v8di){34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > +
> > +void
> > +foo5 ()
> > +{
> > +  f1 = __extension__(v4sf){34, 34, 34, 34};
> > +  f2 = __extension__(v8sf){34, 34, 34, 34, 34, 34, 34, 34};
> > +  f3 = __extension__(v16sf){34, 34, 34, 34, 34, 34, 34, 34,
> > +                           34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > +
> > +void
> > +foo6 ()
> > +{
> > +  d1 = __extension__(v2df){34, 34};
> > +  d2 = __extension__(v4df){34, 34, 34, 34};
> > +  d3 = __extension__(v8df){34, 34, 34, 34, 34, 34, 34, 34};
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-6.c b/gcc/testsuite/gcc.target/i386/pr92080-6.c
> > new file mode 100644
> > index 00000000000..e4cdbee55be
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-6.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m512i sinkz;
> > +extern __m256i sinky;
> > +extern char f;
> > +
> > +void
> > +foo(char c, int x)
> > +{
> > +  c += f;
> > +  sinkz = _mm512_set1_epi8(c);
> > +  if (x == 2)
> > +    f += 3;
> > +  sinky = _mm256_set1_epi8(c);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-7.c b/gcc/testsuite/gcc.target/i386/pr92080-7.c
> > new file mode 100644
> > index 00000000000..8691684e96b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-7.c
> > @@ -0,0 +1,20 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +extern __m512i sinkz;
> > +extern __m256i sinky;
> > +extern char f;
> > +extern void bar (void);
> > +
> > +void
> > +foo(char c, int x)
> > +{
> > +  c += f;
> > +  sinkz = _mm512_set1_epi8(c);
> > +  if (x == 2)
> > +    bar ();
> > +  sinky = _mm256_set1_epi8(c);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-8.c b/gcc/testsuite/gcc.target/i386/pr92080-8.c
> > new file mode 100644
> > index 00000000000..7ebb62cea75
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-8.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */
> > +
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef long long int v2di __attribute__((vector_size(16)));
> > +extern v4si s;
> > +extern v2di l;
> > +
> > +void
> > +foo(void)
> > +{
> > +  l = __extension__(v2di){2,2};
> > +  s = __extension__(v4si){2,2,2,2};
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-9.c b/gcc/testsuite/gcc.target/i386/pr92080-9.c
> > new file mode 100644
> > index 00000000000..f44ab563f54
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr92080-9.c
> > @@ -0,0 +1,81 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=x86-64-v4 -O2" } */
> > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]+" 8 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]+" 3 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqa32\[\\t \]+" 1 } } */
> > +
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef long long int v2di __attribute__((vector_size(16)));
> > +typedef long long v2di __attribute__((vector_size(16)));
> > +typedef long long v4di __attribute__((vector_size(32)));
> > +typedef long long v8di __attribute__((vector_size(64)));
> > +typedef int v4si __attribute__((vector_size(16)));
> > +typedef int v8si __attribute__((vector_size(32)));
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef short v8hi __attribute__((vector_size(16)));
> > +typedef short v16hi __attribute__((vector_size(32)));
> > +typedef short v32hi __attribute__((vector_size(64)));
> > +typedef char v16qi __attribute__((vector_size(16)));
> > +typedef char v32qi __attribute__((vector_size(32)));
> > +typedef char v64qi __attribute__((vector_size(64)));
> > +
> > +extern v16qi b1;
> > +extern v8hi h1;
> > +extern v4si s1;
> > +extern v2di l1;
> > +extern v32qi b2;
> > +extern v16hi h2;
> > +extern v8si s2;
> > +extern v4di l2;
> > +extern v64qi b3;
> > +extern v32hi h3;
> > +extern v16si s3;
> > +extern v8di l3;
> > +
> > +void
> > +foo(void)
> > +{
> > +  b1 = __extension__(v16qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
> > +  h1 = __extension__(v8hi){0x2222, 0x2222, 0x2222, 0x2222,
> > +                          0x2222, 0x2222, 0x2222, 0x2222};
> > +  s1 = __extension__(v4si){0x22222222,0x22222222,0x22222222,0x22222222};
> > +  l1 = __extension__(v2di){0x2222222222222222ULL,0x2222222222222222ULL};
> > +  b2 = __extension__(v32qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
> > +  h2 = __extension__(v16hi){0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222};
> > +  s2 = __extension__(v8si){0x22222222,0x22222222,0x22222222,0x22222222,
> > +                          0x22222222,0x22222222,0x22222222,0x22222222};
> > +  l2 = __extension__(v4di){0x2222222222222222ULL,0x2222222222222222ULL,
> > +                          0x2222222222222222ULL,0x2222222222222222ULL};
> > +  b3 = __extension__(v64qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
> > +                           0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22};
> > +  h3 = __extension__(v32hi){0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222,
> > +                           0x2222, 0x2222, 0x2222, 0x2222};
> > +  s3 = __extension__(v16si){0x22222222,0x22222222,0x22222222,0x22222222,
> > +                           0x22222222,0x22222222,0x22222222,0x22222222,
> > +                           0x22222222,0x22222222,0x22222222,0x22222222,
> > +                           0x22222222,0x22222222,0x22222222,0x22222222};
> > +  l3 = __extension__(v8di){0x2222222222222222ULL,0x2222222222222222ULL,
> > +                          0x2222222222222222ULL,0x2222222222222222ULL,
> > +                          0x2222222222222222ULL,0x2222222222222222ULL,
> > +                          0x2222222222222222ULL,0x2222222222222222ULL};
> > +}
> > --
> > 2.49.0
> >
>
>
> --
> BR,
> Hongtao



-- 
H.J.

Reply via email to