On Mon, Jun 23, 2025 at 3:36 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Thu, Jun 19, 2025 at 10:25 AM H.J. Lu <hjl.to...@gmail.com> wrote: > > > > Extend the remove_redundant_vector pass to handle vector broadcasts from > > constant and variable scalars. When broadcasting from constants and > > function arguments, we can place a single widest vector broadcast at > > entry of the nearest common dominator for basic blocks with all uses > > since constants and function arguments aren't changed. For broadcast > > from variables with a single definition, the single definition is > > replaced with the widest broadcast. > > > > gcc/ > > > > PR target/92080 > > * config/i386/i386-expand.cc (ix86_expand_call): Set > > recursive_function to true for recursive call. > > * config/i386/i386-features.cc (ix86_place_single_vector_set): > > Add an argument for inner scalar, default to nullptr. Set the > > source from inner scalar if not nullptr. > > (ix86_get_vector_load_mode): Renamed to ... > > (ix86_get_vector_cse_mode): This. Add an argument for scalar mode > > and handle integer and float scalar modes. > > (replace_vector_const): Add an argument for scalar mode and pass > > it to ix86_get_vector_load_mode. > > (x86_cse_kind): New. > > (redundant_load): Likewise. > > (ix86_broadcast_inner): Likewise. > > (remove_redundant_vector_load): Also support const0_rtx and > > constm1_rtx broadcasts. Handle vector broadcasts from constant > > and variable scalars. > > * config/i386/i386.h (machine_function): Add recursive_function. > > > > gcc/testsuite/ > > > > * gcc.target/i386/keylocker-aesdecwide128kl.c: Updated to expect > > movdqa instead pxor. > > * gcc.target/i386/keylocker-aesdecwide256kl.c: Likewise. > > * gcc.target/i386/keylocker-aesencwide128kl.c: Likewise. > > * gcc.target/i386/keylocker-aesencwide256kl.c: Likewise. > > * gcc.target/i386/pr92080-4.c: New test. > > * gcc.target/i386/pr92080-5.c: Likewise. > > * gcc.target/i386/pr92080-6.c: Likewise. 
> > * gcc.target/i386/pr92080-7.c: Likewise. > > * gcc.target/i386/pr92080-8.c: Likewise. > > * gcc.target/i386/pr92080-9.c: Likewise. > > * gcc.target/i386/pr92080-10.c: Likewise. > > * gcc.target/i386/pr92080-11.c: Likewise. > > * gcc.target/i386/pr92080-12.c: Likewise. > > * gcc.target/i386/pr92080-13.c: Likewise. > > * gcc.target/i386/pr92080-14.c: Likewise. > > * gcc.target/i386/pr92080-15.c: Likewise. > > * gcc.target/i386/pr92080-16.c: Likewise. > > > > Signed-off-by: H.J. Lu <hjl.to...@gmail.com> > > --- > > gcc/config/i386/i386-expand.cc | 3 + > > gcc/config/i386/i386-features.cc | 410 ++++++++++++++---- > > gcc/config/i386/i386.h | 3 + > > .../i386/keylocker-aesdecwide128kl.c | 14 +- > > .../i386/keylocker-aesdecwide256kl.c | 14 +- > > .../i386/keylocker-aesencwide128kl.c | 14 +- > > .../i386/keylocker-aesencwide256kl.c | 14 +- > > gcc/testsuite/gcc.target/i386/pr92080-10.c | 13 + > > gcc/testsuite/gcc.target/i386/pr92080-11.c | 33 ++ > > gcc/testsuite/gcc.target/i386/pr92080-12.c | 16 + > > gcc/testsuite/gcc.target/i386/pr92080-13.c | 32 ++ > > gcc/testsuite/gcc.target/i386/pr92080-14.c | 31 ++ > > gcc/testsuite/gcc.target/i386/pr92080-15.c | 25 ++ > > gcc/testsuite/gcc.target/i386/pr92080-16.c | 26 ++ > > gcc/testsuite/gcc.target/i386/pr92080-4.c | 50 +++ > > gcc/testsuite/gcc.target/i386/pr92080-5.c | 109 +++++ > > gcc/testsuite/gcc.target/i386/pr92080-6.c | 19 + > > gcc/testsuite/gcc.target/i386/pr92080-7.c | 20 + > > gcc/testsuite/gcc.target/i386/pr92080-8.c | 16 + > > gcc/testsuite/gcc.target/i386/pr92080-9.c | 81 ++++ > > 20 files changed, 823 insertions(+), 120 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-10.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-11.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-12.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-13.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-14.c > > create mode 100644 
gcc/testsuite/gcc.target/i386/pr92080-15.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-16.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-6.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-7.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-9.c > > > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > > index 4946f87a131..60c1ac5d2b1 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -10154,6 +10154,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx > > callarg1, > > else if (lookup_attribute ("no_callee_saved_registers", > > TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))) > > call_no_callee_saved_registers = true; > > + if (fndecl == current_function_decl > > + && decl_binds_to_current_def_p (fndecl)) > > + cfun->machine->recursive_function = true; > > } > > } > > else > > diff --git a/gcc/config/i386/i386-features.cc > > b/gcc/config/i386/i386-features.cc > > index 56ab7f2d23b..2c49d11da8f 100644 > > --- a/gcc/config/i386/i386-features.cc > > +++ b/gcc/config/i386/i386-features.cc > > @@ -3088,10 +3088,12 @@ ix86_rpad_gate () > > /* Generate a vector set, DEST = SRC, at entry of the nearest dominator > > for basic block map BBS, which is in the fake loop that contains the > > whole function, so that there is only a single vector set in the > > - whole function. */ > > + whole function. If not nullptr, INNER_SCALAR is the inner scalar of > > + SRC, as (reg:SI 99) in (vec_duplicate:V4SI (reg:SI 99)). 
*/ > > > > static void > > -ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs) > > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs, > > + rtx inner_scalar = nullptr) > > { > > basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs); > > while (bb->loop_father->latch > > @@ -3112,10 +3114,23 @@ ix86_place_single_vector_set (rtx dest, rtx src, > > bitmap bbs) > > insn = NEXT_INSN (insn); > > } > > > > + rtx_insn *set_insn; > > if (insn == BB_HEAD (bb)) > > - emit_insn_before (set, insn); > > + set_insn = emit_insn_before (set, insn); > > else > > - emit_insn_after (set, insn ? PREV_INSN (insn) : BB_END (bb)); > > + set_insn = emit_insn_after (set, > > + insn ? PREV_INSN (insn) : BB_END (bb)); > > + > > + if (inner_scalar) > > + { > > + /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */ > > + rtx reg = XEXP (src, 0); > > + if ((REG_P (inner_scalar) || MEM_P (inner_scalar)) > > + && GET_MODE (reg) != GET_MODE (inner_scalar)) > > + inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0); > > + rtx set = gen_rtx_SET (reg, inner_scalar); > > + emit_insn_before (set, set_insn); > > + } > > } > > > > /* At entry of the nearest common dominator for basic blocks with > > @@ -3346,26 +3361,15 @@ make_pass_remove_partial_avx_dependency > > (gcc::context *ctxt) > > return new pass_remove_partial_avx_dependency (ctxt); > > } > > > > -/* Return a machine mode suitable for vector SIZE. */ > > +/* Return a machine mode suitable for vector SIZE with SMODE inner > > + mode. 
*/ > > > > static machine_mode > > -ix86_get_vector_load_mode (unsigned int size) > > +ix86_get_vector_cse_mode (unsigned int size, machine_mode smode) > > { > > - machine_mode mode; > > - if (size == 64) > > - mode = V64QImode; > > - else if (size == 32) > > - mode = V32QImode; > > - else if (size == 16) > > - mode = V16QImode; > > - else if (size == 8) > > - mode = V8QImode; > > - else if (size == 4) > > - mode = V4QImode; > > - else if (size == 2) > > - mode = V2QImode; > > - else > > - gcc_unreachable (); > > + scalar_mode s_mode = as_a <scalar_mode> (smode); > > + poly_uint64 nunits = size / GET_MODE_SIZE (smode); > > + machine_mode mode = mode_for_vector (s_mode, nunits).require (); > > return mode; > > } > > > > @@ -3374,7 +3378,8 @@ ix86_get_vector_load_mode (unsigned int size) > > > > static void > > replace_vector_const (machine_mode vector_mode, rtx vector_const, > > - auto_bitmap &vector_insns) > > + auto_bitmap &vector_insns, > > + machine_mode scalar_mode) > > { > > bitmap_iterator bi; > > unsigned int id; > > @@ -3386,7 +3391,8 @@ replace_vector_const (machine_mode vector_mode, rtx > > vector_const, > > /* Get the single SET instruction. */ > > rtx set = single_set (insn); > > rtx src = SET_SRC (set); > > - machine_mode mode = GET_MODE (src); > > + rtx dest = SET_DEST (set); > > + machine_mode mode = GET_MODE (dest); > > > > rtx replace; > > /* Replace the source operand with VECTOR_CONST. */ > > @@ -3400,7 +3406,8 @@ replace_vector_const (machine_mode vector_mode, rtx > > vector_const, > > /* If the mode size is smaller than its natural size, > > first insert an extra move with a QI vector SUBREG > > of the same size to avoid validate_subreg failure. 
*/ > > - machine_mode vmode = ix86_get_vector_load_mode (size); > > + machine_mode vmode > > + = ix86_get_vector_cse_mode (size, scalar_mode); > > rtx vreg; > > if (mode == vmode) > > vreg = vector_const; > > @@ -3426,6 +3433,169 @@ replace_vector_const (machine_mode vector_mode, rtx > > vector_const, > > } > > } > > > > +enum x86_cse_kind > > +{ > > + X86_CSE_CONST0_VECTOR, > > + X86_CSE_CONSTM1_VECTOR, > > + X86_CSE_VEC_DUP > > +}; > > + > > +struct redundant_load > > +{ > > + /* Bitmap of basic blocks with broadcast instructions. */ > > + auto_bitmap bbs; > > + /* Bitmap of broadcast instructions. */ > > + auto_bitmap insns; > > + /* The broadcast inner scalar. */ > > + rtx val; > > + /* The inner scalar mode. */ > > + machine_mode mode; > > + /* The instruction which sets the inner scalar. Nullptr if the inner > > + scalar is applied to the whole function, instead of within the same > > + block. */ > > + rtx_insn *def_insn; > > + /* The widest broadcast source. */ > > + rtx broadcast_source; > > + /* The widest broadcast register. */ > > + rtx broadcast_reg; > > + /* The basic block of the broadcast instruction. */ > > + basic_block bb; > > + /* The number of broadcast instructions with the same inner scalar. */ > > + unsigned HOST_WIDE_INT count; > > + /* The threshold of broadcast instructions with the same inner > > + scalar. */ > > + unsigned int threshold; > > + /* The widest broadcast size in bytes. */ > > + unsigned int size; > > + /* Load kind. */ > > + x86_cse_kind kind; > > +}; > > + > > +/* Return the inner scalar if OP is a broadcast, else return nullptr. 
*/ > > + > > +static rtx > > +ix86_broadcast_inner (rtx op, machine_mode mode, > > + machine_mode *scalar_mode_p, > > + x86_cse_kind *kind_p, rtx_insn **insn_p) > > +{ > > + if (op == const0_rtx || op == CONST0_RTX (mode)) > > + { > > + *scalar_mode_p = QImode; > > + *kind_p = X86_CSE_CONST0_VECTOR; > > + *insn_p = nullptr; > > + return const0_rtx; > > + } > > + else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > + && (op == constm1_rtx || op == CONSTM1_RTX (mode))) > > + { > > + *scalar_mode_p = QImode; > > + *kind_p = X86_CSE_CONSTM1_VECTOR; > > + *insn_p = nullptr; > > + return constm1_rtx; > > + } > > + > > + mode = GET_MODE (op); > > + int nunits = GET_MODE_NUNITS (mode); > > + if (nunits < 2) > > + return nullptr; > > + > > + *kind_p = X86_CSE_VEC_DUP; > > + > > + rtx reg; > > + if (GET_CODE (op) == VEC_DUPLICATE) > > + { > > + /* Only > > + (vec_duplicate:V4SI (reg:SI 99)) > > + (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags > > 0x2]) [0 S8 A64])) > > + are supported. */ > > + op = XEXP (op, 0); > > + reg = op; > > + if (SUBREG_P (op) > > + && SUBREG_BYTE (op) == 0 > > + && !paradoxical_subreg_p (op)) > > + reg = SUBREG_REG (op); > > + if (!REG_P (reg)) > > + { > > + if (MEM_P (op) > > + && SYMBOL_REF_P (XEXP (op, 0)) > > + && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0))) > > + { > > + /* Handle constant broadcast from memory. */ > > + *scalar_mode_p = GET_MODE_INNER (mode); > > + *insn_p = nullptr; > > + return op; > > + } > > + return nullptr; > > + } > > + } > > + else if (CONST_VECTOR_P (op)) > > + { > > + rtx first = XVECEXP (op, 0, 0); > > + for (int i = 1; i < nunits; ++i) > > + { > > + rtx tmp = XVECEXP (op, 0, i); > > + /* Vector duplicate value. */ > > + if (!rtx_equal_p (tmp, first)) > > + return nullptr; > > + } > > + *scalar_mode_p = GET_MODE (first); > > + *insn_p = nullptr; > > + return first; > > + } > > + else > > + return nullptr; > > + > > + mode = GET_MODE (op); > > + > > + /* Only single def chain is supported. 
*/ > > + df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg)); > > + if (!ref > > + || DF_REF_IS_ARTIFICIAL (ref) > > + || DF_REF_NEXT_REG (ref) != nullptr) > > + return nullptr; > > + > > + rtx_insn *insn = DF_REF_INSN (ref); > > + rtx set = single_set (insn); > > + if (!set) > > + return nullptr; > > + > > + rtx dest = SET_DEST (set); > > + > > + op = SET_SRC (set); > > + /* Set *INSN_P if the scalar source isn't a constant nor an incoming > > + argument. */ > > + if (CONST_INT_P (op) || CONST_DOUBLE_P (op)) > > + *insn_p = nullptr; > > + else if (REG_P (op) > > + && REG_EXPR (op) > > + && TREE_CODE (REG_EXPR (op)) == PARM_DECL) > > + *insn_p = nullptr; > > + else if (MEM_P (op) > > + && MEM_EXPR (op) > > + && TREE_CODE (get_base_address (MEM_EXPR (op))) == PARM_DECL) > > + *insn_p = nullptr; > > + else > > + { > > + while (SUBREG_P (dest)) > > + dest = SUBREG_REG (dest); > > + > > + /* Skip if the SET destination mode doesn't match. */ > > + if (GET_MODE (dest) != mode) > > + return nullptr; > > + > > + /* Set the inner scalar to the SET destination. 
*/ > > + op = dest; > > + *insn_p = insn; > > + } > > + > > + *scalar_mode_p = mode; > > + if (CONSTANT_P (op)) > > + *insn_p = nullptr; > > + else > > + *insn_p = insn; > > + return op; > > +} > > + > > /* At entry of the nearest common dominator for basic blocks with vector > > CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest > > vector set instruction for all CONST0_RTX and integer CONSTM1_RTX > > @@ -3440,20 +3610,16 @@ remove_redundant_vector_load (void) > > { > > timevar_push (TV_MACH_DEP); > > > > - auto_bitmap zero_bbs; > > - auto_bitmap m1_bbs; > > - auto_bitmap zero_insns; > > - auto_bitmap m1_insns; > > - > > + auto_vec<redundant_load *> loads; > > + redundant_load *load; > > basic_block bb; > > rtx_insn *insn; > > - unsigned HOST_WIDE_INT zero_count = 0; > > - unsigned HOST_WIDE_INT m1_count = 0; > > - unsigned int zero_size = 0; > > - unsigned int m1_size = 0; > > + unsigned int i; > > > > df_set_flags (DF_DEFER_INSN_RESCAN); > > > > + bool recursive_call_p = cfun->machine->recursive_function; > > + > > FOR_EACH_BB_FN (bb, cfun) > > { > > FOR_BB_INSNS (bb, insn) > > @@ -3481,79 +3647,139 @@ remove_redundant_vector_load (void) > > if (!REG_P (dest) && !SUBREG_P (dest)) > > continue; > > > > - if (src == CONST0_RTX (mode)) > > - { > > - /* Record vector instruction with CONST0_RTX. */ > > - bitmap_set_bit (zero_insns, INSN_UID (insn)); > > + rtx_insn *def_insn; > > + machine_mode scalar_mode; > > + x86_cse_kind kind; > > + rtx val = ix86_broadcast_inner (src, mode, &scalar_mode, > > + &kind, &def_insn); > > + if (!val) > > + continue; > > > > - /* Record the maximum vector size. */ > > - if (zero_size < GET_MODE_SIZE (mode)) > > - zero_size = GET_MODE_SIZE (mode); > > + /* Remove redundant register loads if there are more than 2 > > + loads will be used. */ > > + unsigned int threshold = 2; > > + > > + /* Check if there is a matching redundant vector load. 
*/ > > + bool matched = false; > > + FOR_EACH_VEC_ELT (loads, i, load) > > + if (load->val > > + && load->kind == kind > > + && load->mode == scalar_mode > > + && (load->bb == bb > > + || kind < X86_CSE_VEC_DUP > > + /* Non all 0s/1s vector load must be in the same > > + basic block if it is in a recursive call. */ > > + || !recursive_call_p) > > + && rtx_equal_p (load->val, val)) > So I guess cse_insn across recursive call make RA different to > allocate with caller_saved registers since they're in the same > register and need to be saved and restored due to ABI? > It may be ok for different calls since those cse_insns can be > allocated with different registers and can be handled well for IPA-RA?
Since callee-saved registers can be used, the caller doesn't need to save and restore them around the call. When the caller is also the callee (caller == callee, i.e. a recursive call), it becomes expensive. > > + { > > + /* Record vector instruction. */ > > + bitmap_set_bit (load->insns, INSN_UID (insn)); > > > > - /* Record the basic block with CONST0_RTX. */ > > - bitmap_set_bit (zero_bbs, bb->index); > > - zero_count++; > > - } > > - else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT > > - && src == CONSTM1_RTX (mode)) > > - { > > - /* Record vector instruction with CONSTM1_RTX. */ > > - bitmap_set_bit (m1_insns, INSN_UID (insn)); > > + /* Record the maximum vector size. */ > > + if (load->size < GET_MODE_SIZE (mode)) > > + load->size = GET_MODE_SIZE (mode); > > > > - /* Record the maximum vector size. */ > > - if (m1_size < GET_MODE_SIZE (mode)) > > - m1_size = GET_MODE_SIZE (mode); > > + /* Record the basic block. */ > > + bitmap_set_bit (load->bbs, bb->index); > > + load->count++; > > + matched = true; > > + break; > > + } > > > > - /* Record the basic block with CONSTM1_RTX. */ > > - bitmap_set_bit (m1_bbs, bb->index); > > - m1_count++; > > - } > > - } > > - } > > + if (matched) > > + continue; > > > > - if (zero_count > 1 || m1_count > 1) > > - { > > - machine_mode zero_mode, m1_mode; > > - rtx vector_const0, vector_constm1; > > + /* We see this vector broadcast the first time. 
*/ > > + load = new redundant_load; > > > > - if (zero_count > 1) > > - { > > - zero_mode = ix86_get_vector_load_mode (zero_size); > > - vector_const0 = gen_reg_rtx (zero_mode); > > - replace_vector_const (zero_mode, vector_const0, zero_insns); > > - } > > - else > > - { > > - zero_mode = VOIDmode; > > - vector_const0 = nullptr; > > - } > > + load->val = copy_rtx (val); > > + load->mode = scalar_mode; > > + load->size = GET_MODE_SIZE (mode); > > + load->def_insn = def_insn; > > + load->count = 1; > > + load->threshold = threshold; > > + load->bb = BLOCK_FOR_INSN (insn); > > + load->kind = kind; > > > > - if (m1_count > 1) > > - { > > - m1_mode = ix86_get_vector_load_mode (m1_size); > > - vector_constm1 = gen_reg_rtx (m1_mode); > > - replace_vector_const (m1_mode, vector_constm1, m1_insns); > > - } > > - else > > - { > > - m1_mode = VOIDmode; > > - vector_constm1 = nullptr; > > + bitmap_set_bit (load->insns, INSN_UID (insn)); > > + bitmap_set_bit (load->bbs, bb->index); > > + > > + loads.safe_push (load); > > } > > + } > > > > + bool replaced = false; > > + rtx reg, broadcast_source, broadcast_reg; > > + FOR_EACH_VEC_ELT (loads, i, load) > > + if (load->count >= load->threshold) > > + { > > + machine_mode mode = ix86_get_vector_cse_mode (load->size, > > + load->mode); > > + broadcast_reg = gen_reg_rtx (mode); > > + if (load->def_insn) > > + { > > + /* Replace redundant vector loads with a single vector load > > + in the same basic block. */ > > + reg = load->val; > > + if (load->mode != GET_MODE (reg)) > > + reg = gen_rtx_SUBREG (load->mode, reg, 0); > > + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); > > + replace_vector_const (mode, broadcast_reg, load->insns, > > + load->mode); > > + } > > + else > > + { > > + /* This is a constant integer/double vector. If the > > + inner scalar is 0 or -1, set vector to CONST0_RTX > > + or CONSTM1_RTX directly. 
*/ > > + rtx reg; > > + switch (load->kind) > > + { > > + case X86_CSE_CONST0_VECTOR: > > + broadcast_source = CONST0_RTX (mode); > > + break; > > + case X86_CSE_CONSTM1_VECTOR: > > + broadcast_source = CONSTM1_RTX (mode); > > + break; > > + default: > > + reg = gen_reg_rtx (load->mode); > > + broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg); > > + break; > > + } > > + replace_vector_const (mode, broadcast_reg, load->insns, > > + load->mode); > > + } > > + load->broadcast_source = broadcast_source; > > + load->broadcast_reg = broadcast_reg; > > + replaced = true; > > + } > > + > > + if (replaced) > > + { > > /* (Re-)discover loops so that bb->loop_father can be used in the > > analysis below. */ > > calculate_dominance_info (CDI_DOMINATORS); > > loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > > > > - if (vector_const0) > > - ix86_place_single_vector_set (vector_const0, > > - CONST0_RTX (zero_mode), > > - zero_bbs); > > - > > - if (vector_constm1) > > - ix86_place_single_vector_set (vector_constm1, > > - CONSTM1_RTX (m1_mode), > > - m1_bbs); > > + FOR_EACH_VEC_ELT (loads, i, load) > > + if (load->count >= load->threshold) > > + { > > + if (load->def_insn) > > + { > > + /* Insert a broadcast after the original scalar > > + definition. */ > > + rtx set = gen_rtx_SET (load->broadcast_reg, > > + load->broadcast_source); > > + insn = emit_insn_after (set, load->def_insn); > > + } > > + else > > + ix86_place_single_vector_set (load->broadcast_reg, > > + load->broadcast_source, > > + load->bbs, > > + (load->kind == X86_CSE_VEC_DUP > > + ? load->val > > + : nullptr)); > > + } > > > > loop_optimizer_finalize (); > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index 7c16eac7700..812055085bb 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -2924,6 +2924,9 @@ struct GTY(()) machine_function { > > /* True if inline asm with redzone clobber has been seen. 
*/ > > BOOL_BITFIELD asm_redzone_clobber_seen : 1; > > > > + /* True if this is a recursive function. */ > > + BOOL_BITFIELD recursive_function : 1; > > + > > /* The largest alignment, in bytes, of stack slot actually used. */ > > unsigned int max_used_stack_alignment; > > > > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c > > b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c > > index 93806e51508..e73ba35ddd1 100644 > > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c > > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide128kl.c > > @@ -19,14 +19,14 @@ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */ > > /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */ > > +/* { dg-final { scan-assembler 
"movdqa\[ \t\]+%xmm7, %xmm6" } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c > > b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c > > index f9ccc82c7ca..33cd998bfdf 100644 > > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c > > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesdecwide256kl.c > > @@ -19,14 +19,14 @@ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */ > > /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c > > 
b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c > > index c0fcd28fb07..75106e59b77 100644 > > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c > > +++ b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide128kl.c > > @@ -19,14 +19,14 @@ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */ > > /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c > > b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c > > index 31463a8b2da..2787732229a 100644 > > --- a/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c > > +++ 
b/gcc/testsuite/gcc.target/i386/keylocker-aesencwide256kl.c > > @@ -19,14 +19,14 @@ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm5,\[^\\n\\r\]*80\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm6,\[^\\n\\r\]*96\[^\\n\\r\]*" } } */ > > /* { dg-final { scan-assembler "(?:movdqu|movups)\[ > > \\t\]+\[^\\n\\r\]*%xmm7,\[^\\n\\r\]*112\[^\\n\\r\]*" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm0, %xmm0" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm1, %xmm1" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm2, %xmm2" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm3, %xmm3" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm4, %xmm4" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm5, %xmm5" } } */ > > -/* { dg-final { scan-assembler "pxor\[ \t\]+%xmm6, %xmm6" } } */ > > /* { dg-final { scan-assembler "pxor\[ \t\]+%xmm7, %xmm7" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm0" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm1" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm2" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm3" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm4" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm5" } } */ > > +/* { dg-final { scan-assembler "movdqa\[ \t\]+%xmm7, %xmm6" } } */ > > > > #include <immintrin.h> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-10.c > > b/gcc/testsuite/gcc.target/i386/pr92080-10.c > > new file mode 100644 > > index 00000000000..b67f9d8d285 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-10.c > > @@ -0,0 +1,13 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=sapphirerapids -Ofast" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */ > > + > > +extern short write_picture_p_Vid_0; 
> > +extern unsigned short *write_picture_p_2_0_0; > > +extern int write_picture_p_0, write_picture_p_1, write_picture_i; > > +void write_picture() { > > + unsigned short cr_val = 1 << write_picture_p_Vid_0; > > + for (; write_picture_p_1;) > > + for (; write_picture_i < write_picture_p_0; write_picture_i++) > > + write_picture_p_2_0_0[write_picture_i] = cr_val; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-11.c > > b/gcc/testsuite/gcc.target/i386/pr92080-11.c > > new file mode 100644 > > index 00000000000..8747fc47640 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-11.c > > @@ -0,0 +1,33 @@ > > +/* { dg-do run { target { avx512f_runtime } } } */ > > +/* { dg-options "-mavx512f -mtune=icelake-server -O3" } */ > > + > > +struct s { > > + char s[sizeof(long double)]; > > +}; > > + > > +union u { > > + long double d; > > + struct s s; > > +}; > > + > > +int main() > > +{ > > + union u x = {0}; > > +#if __SIZEOF_LONG_DOUBLE__ == 16 > > + x.s = (struct s){"xxxxxxxxxxxxxxxx"}; > > +#elif __SIZEOF_LONG_DOUBLE__ == 12 > > + x.s = (struct s){"xxxxxxxxxxxx"}; > > +#elif __SIZEOF_LONG_DOUBLE__ == 8 > > + x.s = (struct s){"xxxxxxxx"}; > > +#elif __SIZEOF_LONG_DOUBLE__ == 4 > > + x.s = (struct s){"xxxx"}; > > +#endif > > + > > + union u y = x; > > + > > + for (unsigned char *p = (unsigned char *)&y + sizeof y; > > + p-- > (unsigned char *)&y;) > > + if (*p != (unsigned char)'x') > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-12.c > > b/gcc/testsuite/gcc.target/i386/pr92080-12.c > > new file mode 100644 > > index 00000000000..cb09eb2f0a8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-12.c > > @@ -0,0 +1,16 @@ > > +/* { dg-do compile } */ > > +/* { dg-additional-options "-O3 -mno-mmx -march=icelake-server" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > + > > +signed char a; > > +signed char f (int i, int j) > > +{ > > + signed char c; > > + 
while (i != 0) > > + { > > + a ^= j; > > + ++c; > > + ++i; > > + } > > + return c; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-13.c > > b/gcc/testsuite/gcc.target/i386/pr92080-13.c > > new file mode 100644 > > index 00000000000..24b7616c894 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-13.c > > @@ -0,0 +1,32 @@ > > +/* { dg-do run { target { avx512f_runtime } } } */ > > +/* { dg-options "-mavx512f -mtune=icelake-server -O2 -save-temps" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */ > > + > > +#include <assert.h> > > + > > +#define CONTAINER_KIND union > > + > > +typedef CONTAINER_KIND container { int value; } container; > > + > > +void move(container* end, container* start) { > > + container* p; > > + for (p = end; p > start; p--) { > > + (p)->value = (p-1)->value; > > + } > > +} > > + > > +#define N 100 > > + > > +int main(int argc, char* argv[]) { > > + container vals[N]; > > + int i; > > + for (i=0; i<N; i++) { > > + vals[i].value = argc + i; > > + } > > + move(&vals[N-1], &vals[0]); > > + assert(vals[0].value == argc + 0); > > + for (i=1; i<N; i++) { > > + assert(vals[i].value == argc + i - 1); > > + } > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-14.c > > b/gcc/testsuite/gcc.target/i386/pr92080-14.c > > new file mode 100644 > > index 00000000000..6be41b63400 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-14.c > > @@ -0,0 +1,31 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */ > > + > > +typedef int v16si __attribute__((vector_size(64))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v4si __attribute__((vector_size(16))); > > + > > +extern v16si sinksz; > > +extern v8si sinksy; > > +extern v4si sinksx; > > +extern v4si sinksx1; > > + > > +extern void bar (void); > > + > > +void > > +foo (char c, int i) > > +{ > > + sinksz = 
__extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + if (i == 1) > > + { > > + sinksy = __extension__(v8si){c,c,c,c,c,c,c,c}; > > + bar (); > > + } > > + else if (i == 2) > > + { > > + sinksx = __extension__(v4si){c,c,c,c}; > > + bar (); > > + } > > + sinksx1 = __extension__(v4si){c,c,c,c}; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-15.c > > b/gcc/testsuite/gcc.target/i386/pr92080-15.c > > new file mode 100644 > > index 00000000000..fa55d82e48e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-15.c > > @@ -0,0 +1,25 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O1 -march=x86-64-v4" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v16si __attribute__((vector_size(64))); > > + > > +extern v4si *s1; > > +extern v8si *s2; > > +extern v16si *s3; > > + > > +int > > +foo (int i, int j) > > +{ > > + if (j == 1) > > + s1[i] = __extension__(v4si){34, 34, 34, 34}; > > + else if (i == 1) > > + s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34}; > > + if ((i + j) == 1234) > > + i = foo (j, i); > > + s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > + return i - j; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-16.c > > b/gcc/testsuite/gcc.target/i386/pr92080-16.c > > new file mode 100644 > > index 00000000000..c8ab084b714 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-16.c > > @@ -0,0 +1,26 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O1 -march=x86-64-v4" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v16si __attribute__((vector_size(64))); > > + > > +extern v4si *s1; > > +extern v8si *s2; > > +extern v16si 
*s3; > > + > > +int > > +foo (int i, int j) > > +{ > > + if (j == 1) > > + { > > + s1[i] = __extension__(v4si){34, 34, 34, 34}; > > + s2[j] = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34}; > > + s3[i + j] = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > + } > > + if ((i + j) == 1234) > > + i = foo (j, i); > > + return i - j; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-4.c > > b/gcc/testsuite/gcc.target/i386/pr92080-4.c > > new file mode 100644 > > index 00000000000..ebe1384c691 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-4.c > > @@ -0,0 +1,50 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastw" 1 } } */ > > + > > +typedef int v16si __attribute__((vector_size(64))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v4si __attribute__((vector_size(16))); > > + > > +typedef short v32hi __attribute__((vector_size(64))); > > +typedef short v16hi __attribute__((vector_size(32))); > > +typedef short v8hi __attribute__((vector_size(16))); > > + > > +typedef char v64qi __attribute__((vector_size(64))); > > +typedef char v32qi __attribute__((vector_size(32))); > > +typedef char v16qi __attribute__((vector_size(16))); > > + > > +extern v16si sinksz; > > +extern v8si sinksy; > > +extern v4si sinksx; > > +extern v32hi sinkhz; > > +extern v16hi sinkhy; > > +extern v8hi sinkhx; > > +extern v64qi sinkbz; > > +extern v32qi sinkby; > > +extern v16qi sinkbx; > > + > > +void foo(char c) { > > + sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + sinksy = __extension__(v8si){c,c,c,c,c,c,c,c}; > > + sinksx = __extension__(v4si){c,c,c,c}; > > +} > > + > > +void foo1(char c) { > > + sinkhz = 
__extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, > > + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c}; > > +} > > + > > +void foo2(char c) { > > + sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, > > + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, > > + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, > > + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, > > + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > + sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-5.c > > b/gcc/testsuite/gcc.target/i386/pr92080-5.c > > new file mode 100644 > > index 00000000000..380cd337e09 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-5.c > > @@ -0,0 +1,109 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -march=x86-64-v4" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 3 } } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */ > > +/* { dg-final { scan-assembler-times "vbroadcastsd" 1 } } */ > > +/* { dg-final { scan-assembler-times "vbroadcastss" 1 } } */ > > + > > +typedef long long v2di __attribute__((vector_size(16))); > > +typedef long long v4di __attribute__((vector_size(32))); > > +typedef long long v8di __attribute__((vector_size(64))); > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v16si __attribute__((vector_size(64))); > > +typedef short v8hi __attribute__((vector_size(16))); > > +typedef short v16hi __attribute__((vector_size(32))); > > +typedef short v32hi __attribute__((vector_size(64))); > > +typedef char v16qi __attribute__((vector_size(16))); > > +typedef char v32qi __attribute__((vector_size(32))); > > +typedef char v64qi __attribute__((vector_size(64))); > > +typedef float v4sf __attribute__((vector_size(16))); > 
> +typedef float v8sf __attribute__((vector_size(32))); > > +typedef float v16sf __attribute__((vector_size(64))); > > +typedef double v2df __attribute__((vector_size(16))); > > +typedef double v4df __attribute__((vector_size(32))); > > +typedef double v8df __attribute__((vector_size(64))); > > + > > +extern v16qi b1; > > +extern v8hi h1; > > +extern v4si s1; > > +extern v2di l1; > > +extern v4sf f1; > > +extern v2df d1; > > +extern v32qi b2; > > +extern v16hi h2; > > +extern v8si s2; > > +extern v4di l2; > > +extern v8sf f2; > > +extern v4df d2; > > +extern v64qi b3; > > +extern v32hi h3; > > +extern v16si s3; > > +extern v8di l3; > > +extern v16sf f3; > > +extern v8df d3; > > + > > +void > > +foo1 () > > +{ > > + b1 = __extension__(v16qi){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > + b2 = __extension__(v32qi){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > + b3 = __extension__(v64qi){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > + > > +void > > +foo2 () > > +{ > > + h1 = __extension__(v8hi){34, 34, 34, 34, 34, 34, 34, 34}; > > + h2 = __extension__(v16hi){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > + h3 = __extension__(v32hi){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > + > > +void > > +foo3 () > > +{ > > + s1 = __extension__(v4si){34, 34, 34, 34}; > > + s2 = __extension__(v8si){34, 34, 34, 34, 34, 34, 34, 34}; > > + s3 = __extension__(v16si){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > + > > 
+void > > +foo4 () > > +{ > > + l1 = __extension__(v2di){34, 34}; > > + l2 = __extension__(v4di){34, 34, 34, 34}; > > + l3 = __extension__(v8di){34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > + > > +void > > +foo5 () > > +{ > > + f1 = __extension__(v4sf){34, 34, 34, 34}; > > + f2 = __extension__(v8sf){34, 34, 34, 34, 34, 34, 34, 34}; > > + f3 = __extension__(v16sf){34, 34, 34, 34, 34, 34, 34, 34, > > + 34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > + > > +void > > +foo6 () > > +{ > > + d1 = __extension__(v2df){34, 34}; > > + d2 = __extension__(v4df){34, 34, 34, 34}; > > + d3 = __extension__(v8df){34, 34, 34, 34, 34, 34, 34, 34}; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-6.c > > b/gcc/testsuite/gcc.target/i386/pr92080-6.c > > new file mode 100644 > > index 00000000000..e4cdbee55be > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-6.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > + > > +#include <immintrin.h> > > + > > +extern __m512i sinkz; > > +extern __m256i sinky; > > +extern char f; > > + > > +void > > +foo(char c, int x) > > +{ > > + c += f; > > + sinkz = _mm512_set1_epi8(c); > > + if (x == 2) > > + f += 3; > > + sinky = _mm256_set1_epi8(c); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-7.c > > b/gcc/testsuite/gcc.target/i386/pr92080-7.c > > new file mode 100644 > > index 00000000000..8691684e96b > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-7.c > > @@ -0,0 +1,20 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */ > > + > > +#include <immintrin.h> > > + > > +extern __m512i sinkz; > > +extern __m256i sinky; > > +extern char f; > > +extern void bar (void); > > + > > +void > > +foo(char c, int x) > > +{ > > + c += f; > > + sinkz = _mm512_set1_epi8(c); > > + if (x == 2) > > + bar (); > 
> + sinky = _mm256_set1_epi8(c); > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-8.c > > b/gcc/testsuite/gcc.target/i386/pr92080-8.c > > new file mode 100644 > > index 00000000000..7ebb62cea75 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-8.c > > @@ -0,0 +1,16 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastq" 1 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef long long int v2di __attribute__((vector_size(16))); > > +extern v4si s; > > +extern v2di l; > > + > > +void > > +foo(void) > > +{ > > + l = __extension__(v2di){2,2}; > > + s = __extension__(v4si){2,2,2,2}; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr92080-9.c > > b/gcc/testsuite/gcc.target/i386/pr92080-9.c > > new file mode 100644 > > index 00000000000..f44ab563f54 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr92080-9.c > > @@ -0,0 +1,81 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-march=x86-64-v4 -O2" } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastd" 1 } } */ > > +/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]+" 8 } } */ > > +/* { dg-final { scan-assembler-times "vmovdqa64\[\\t \]+" 3 } } */ > > +/* { dg-final { scan-assembler-times "vmovdqa32\[\\t \]+" 1 } } */ > > + > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef long long int v2di __attribute__((vector_size(16))); > > +typedef long long v2di __attribute__((vector_size(16))); > > +typedef long long v4di __attribute__((vector_size(32))); > > +typedef long long v8di __attribute__((vector_size(64))); > > +typedef int v4si __attribute__((vector_size(16))); > > +typedef int v8si __attribute__((vector_size(32))); > > +typedef int v16si __attribute__((vector_size(64))); > > +typedef short v8hi __attribute__((vector_size(16))); > > +typedef short v16hi 
__attribute__((vector_size(32))); > > +typedef short v32hi __attribute__((vector_size(64))); > > +typedef char v16qi __attribute__((vector_size(16))); > > +typedef char v32qi __attribute__((vector_size(32))); > > +typedef char v64qi __attribute__((vector_size(64))); > > + > > +extern v16qi b1; > > +extern v8hi h1; > > +extern v4si s1; > > +extern v2di l1; > > +extern v32qi b2; > > +extern v16hi h2; > > +extern v8si s2; > > +extern v4di l2; > > +extern v64qi b3; > > +extern v32hi h3; > > +extern v16si s3; > > +extern v8di l3; > > + > > +void > > +foo(void) > > +{ > > + b1 = __extension__(v16qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22}; > > + h1 = __extension__(v8hi){0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222}; > > + s1 = __extension__(v4si){0x22222222,0x22222222,0x22222222,0x22222222}; > > + l1 = __extension__(v2di){0x2222222222222222ULL,0x2222222222222222ULL}; > > + b2 = __extension__(v32qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22}; > > + h2 = __extension__(v16hi){0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222}; > > + s2 = __extension__(v8si){0x22222222,0x22222222,0x22222222,0x22222222, > > + 0x22222222,0x22222222,0x22222222,0x22222222}; > > + l2 = __extension__(v4di){0x2222222222222222ULL,0x2222222222222222ULL, > > + 0x2222222222222222ULL,0x2222222222222222ULL}; > > + b3 = __extension__(v64qi){0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 
0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, > > + 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22}; > > + h3 = __extension__(v32hi){0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222, > > + 0x2222, 0x2222, 0x2222, 0x2222}; > > + s3 = __extension__(v16si){0x22222222,0x22222222,0x22222222,0x22222222, > > + 0x22222222,0x22222222,0x22222222,0x22222222, > > + 0x22222222,0x22222222,0x22222222,0x22222222, > > + 0x22222222,0x22222222,0x22222222,0x22222222}; > > + l3 = __extension__(v8di){0x2222222222222222ULL,0x2222222222222222ULL, > > + 0x2222222222222222ULL,0x2222222222222222ULL, > > + 0x2222222222222222ULL,0x2222222222222222ULL, > > + 0x2222222222222222ULL,0x2222222222222222ULL}; > > +} > > -- > > 2.49.0 > > > > > -- > BR, > Hongtao -- H.J.