On Mon, Jun 23, 2025 at 2:24 PM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> On Wed, Jun 18, 2025 at 3:17 PM H.J. Lu <hjl.to...@gmail.com> wrote:
> >
> > 1. Don't generate the loop if the loop count is 1.
> > 2. For memset with vector on small size, use vector if small size supports
> > vector, otherwise use the scalar value.
> > 3. Duplicate the promoted scalar value for vector.
> > 4. Always expand vector-version of memset for vector_loop.
> > 5. Use misaligned prologue if alignment isn't needed.  When misaligned
> > prologue is used, check if destination is actually aligned and update
> > destination alignment if aligned.
> >
> > The included tests show that codegen of vector_loop/unrolled_loop for
> > memset/memcpy are significantly improved.  For
> >
> > ---
> > void
> > foo (void *p1, size_t len)
> > {
> >   __builtin_memset (p1, 0, len);
> > }
> > ---
> >
> > with
> >
> > -O2 -minline-all-stringops 
> > -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64
> >
> > we used to generate
> >
> > foo:
> > .LFB0:
> >         .cfi_startproc
> >         movq    %rdi, %rax
> >         pxor    %xmm0, %xmm0
> >         cmpq    $64, %rsi
> >         jnb     .L18
> > .L2:
> >         andl    $63, %esi
> >         je      .L1
> >         xorl    %edx, %edx
> >         testb   $1, %sil
> >         je      .L5
> >         movl    $1, %edx
> >         movb    $0, (%rax)
> >         cmpq    %rsi, %rdx
> >         jnb     .L19
> > .L5:
> >         movb    $0, (%rax,%rdx)
> >         movb    $0, 1(%rax,%rdx)
> >         addq    $2, %rdx
> >         cmpq    %rsi, %rdx
> >         jb      .L5
> > .L1:
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L18:
> >         movq    %rsi, %rdx
> >         xorl    %eax, %eax
> >         andq    $-64, %rdx
> > .L3:
> >         movups  %xmm0, (%rdi,%rax)
> >         movups  %xmm0, 16(%rdi,%rax)
> >         movups  %xmm0, 32(%rdi,%rax)
> >         movups  %xmm0, 48(%rdi,%rax)
> >         addq    $64, %rax
> >         cmpq    %rdx, %rax
> >         jb      .L3
> >         addq    %rdi, %rax
> >         jmp     .L2
> > .L19:
> >         ret
> >         .cfi_endproc
> >
> > with very poor prologue/epilogue.  With this patch, we now generate:
> >
> > foo:
> > .LFB0:
> >         .cfi_startproc
> >         pxor    %xmm0, %xmm0
> >         cmpq    $64, %rsi
> >         jnb     .L2
> >         testb   $32, %sil
> >         jne     .L19
> >         testb   $16, %sil
> >         jne     .L20
> >         testb   $8, %sil
> >         jne     .L21
> >         testb   $4, %sil
> >         jne     .L22
> >         testq   %rsi, %rsi
> >         jne     .L23
> > .L1:
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L2:
> >         movups  %xmm0, -64(%rdi,%rsi)
> >         movups  %xmm0, -48(%rdi,%rsi)
> >         movups  %xmm0, -32(%rdi,%rsi)
> >         movups  %xmm0, -16(%rdi,%rsi)
> >         subq    $1, %rsi
> >         cmpq    $64, %rsi
> >         jb      .L1
> >         andq    $-64, %rsi
> >         xorl    %eax, %eax
> > .L9:
> >         movups  %xmm0, (%rdi,%rax)
> >         movups  %xmm0, 16(%rdi,%rax)
> >         movups  %xmm0, 32(%rdi,%rax)
> >         movups  %xmm0, 48(%rdi,%rax)
> >         addq    $64, %rax
> >         cmpq    %rsi, %rax
> >         jb      .L9
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L23:
> >         movb    $0, (%rdi)
> >         testb   $2, %sil
> >         je      .L1
> >         xorl    %eax, %eax
> >         movw    %ax, -2(%rdi,%rsi)
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L19:
> >         movups  %xmm0, (%rdi)
> >         movups  %xmm0, 16(%rdi)
> >         movups  %xmm0, -32(%rdi,%rsi)
> >         movups  %xmm0, -16(%rdi,%rsi)
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L20:
> >         movups  %xmm0, (%rdi)
> >         movups  %xmm0, -16(%rdi,%rsi)
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L21:
> >         movq    $0, (%rdi)
> >         movq    $0, -8(%rdi,%rsi)
> >         ret
> >         .p2align 4,,10
> >         .p2align 3
> > .L22:
> >         movl    $0, (%rdi)
> >         movl    $0, -4(%rdi,%rsi)
> >         ret
> >         .cfi_endproc
>
>
> Here is the v2 patch with the memset improvements:
>
> 1. Always duplicate the promoted scalar value for vector_loop if not 0
> nor -1.
> 2. Update setmem_epilogue_gen_val to use the RTL info from the previous
> iteration.
>
> OK for master?
>

Here is the v3 patch rebased against

commit d073bb6cfc219d4b6c283a0b527ee88b42e640e0
Author: H.J. Lu <hjl.to...@gmail.com>
Date:   Thu Mar 18 18:43:10 2021 -0700

    x86: Update memcpy/memset inline strategies for -mtune=generic

OK for master?

Thanks.

-- 
H.J.
From 2c3f65951232ec44722bc4e0c997a3ec21b84614 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Tue, 17 Jun 2025 10:17:17 +0800
Subject: [PATCH v3] x86: Improve vector_loop/unrolled_loop for memset/memcpy

1. Don't generate the loop if the loop count is 1.
2. For memset with vector on small size, use a narrower vector if the small
size supports one, otherwise use the scalar value.
3. Always expand vector-version of memset for vector_loop.
4. Always duplicate the promoted scalar value for vector_loop if it is
neither 0 nor -1.
5. Use misaligned prologue if alignment isn't needed.  When misaligned
prologue is used, check if destination is actually aligned and update
destination alignment if aligned.
6. Use move_by_pieces and store_by_pieces for memcpy and memset epilogues
with the fixed epilogue size to enable overlapping moves and stores.

The included tests show that codegen of vector_loop/unrolled_loop for
memset/memcpy is significantly improved.  For

---
void
foo (void *p1, size_t len)
{
  __builtin_memset (p1, 0, len);
}
---

with

-O2 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -march=x86-64

we used to generate

foo:
.LFB0:
	.cfi_startproc
	movq	%rdi, %rax
	pxor	%xmm0, %xmm0
	cmpq	$64, %rsi
	jnb	.L18
.L2:
	andl	$63, %esi
	je	.L1
	xorl	%edx, %edx
	testb	$1, %sil
	je	.L5
	movl	$1, %edx
	movb	$0, (%rax)
	cmpq	%rsi, %rdx
	jnb	.L19
.L5:
	movb	$0, (%rax,%rdx)
	movb	$0, 1(%rax,%rdx)
	addq	$2, %rdx
	cmpq	%rsi, %rdx
	jb	.L5
.L1:
	ret
	.p2align 4,,10
	.p2align 3
.L18:
	movq	%rsi, %rdx
	xorl	%eax, %eax
	andq	$-64, %rdx
.L3:
	movups	%xmm0, (%rdi,%rax)
	movups	%xmm0, 16(%rdi,%rax)
	movups	%xmm0, 32(%rdi,%rax)
	movups	%xmm0, 48(%rdi,%rax)
	addq	$64, %rax
	cmpq	%rdx, %rax
	jb	.L3
	addq	%rdi, %rax
	jmp	.L2
.L19:
	ret
	.cfi_endproc

with very poor prologue/epilogue.  With this patch, we now generate:

foo:
.LFB0:
	.cfi_startproc
	pxor	%xmm0, %xmm0
	cmpq	$64, %rsi
	jnb	.L2
	testb	$32, %sil
	jne	.L19
	testb	$16, %sil
	jne	.L20
	testb	$8, %sil
	jne	.L21
	testb	$4, %sil
	jne	.L22
	testq	%rsi, %rsi
	jne	.L23
.L1:
	ret
	.p2align 4,,10
	.p2align 3
.L2:
	movups	%xmm0, -64(%rdi,%rsi)
	movups	%xmm0, -48(%rdi,%rsi)
	movups	%xmm0, -32(%rdi,%rsi)
	movups	%xmm0, -16(%rdi,%rsi)
	subq	$1, %rsi
	cmpq	$64, %rsi
	jb	.L1
	andq	$-64, %rsi
	xorl	%eax, %eax
.L9:
	movups	%xmm0, (%rdi,%rax)
	movups	%xmm0, 16(%rdi,%rax)
	movups	%xmm0, 32(%rdi,%rax)
	movups	%xmm0, 48(%rdi,%rax)
	addq	$64, %rax
	cmpq	%rsi, %rax
	jb	.L9
	ret
	.p2align 4,,10
	.p2align 3
.L23:
	movb	$0, (%rdi)
	testb	$2, %sil
	je	.L1
	xorl	%eax, %eax
	movw	%ax, -2(%rdi,%rsi)
	ret
	.p2align 4,,10
	.p2align 3
.L19:
	movups	%xmm0, (%rdi)
	movups	%xmm0, 16(%rdi)
	movups	%xmm0, -32(%rdi,%rsi)
	movups	%xmm0, -16(%rdi,%rsi)
	ret
	.p2align 4,,10
	.p2align 3
.L20:
	movups	%xmm0, (%rdi)
	movups	%xmm0, -16(%rdi,%rsi)
	ret
	.p2align 4,,10
	.p2align 3
.L21:
	movq	$0, (%rdi)
	movq	$0, -8(%rdi,%rsi)
	ret
	.p2align 4,,10
	.p2align 3
.L22:
	movl	$0, (%rdi)
	movl	$0, -4(%rdi,%rsi)
	ret
	.cfi_endproc

gcc/

	PR target/120683
	* config/i386/i386-expand.cc (expand_set_or_cpymem_via_loop):
	Don't generate the loop if the loop count is 1.
	(expand_cpymem_epilogue): Use move_by_pieces.
	(setmem_epilogue_gen_val): New.
	(expand_setmem_epilogue): Use store_by_pieces.
	(expand_small_cpymem_or_setmem): Choose cpymem mode from MOVE_MAX.
	For memset with vector and the size is smaller than the vector
	size, first try the narrower vector, otherwise, use the scalar
	value.
	(promote_duplicated_reg): Duplicate the scalar value for vector.
	(ix86_expand_set_or_cpymem): Always expand vector-version of
	memset for vector_loop.  Use misaligned prologue if alignment
	isn't needed and destination isn't aligned.  Always initialize
	vec_promoted_val from the promoted scalar value for vector_loop.

gcc/testsuite/

	PR target/120683
	* gcc.target/i386/auto-init-padding-9.c: Updated.
	* gcc.target/i386/memcpy-strategy-12.c: Likewise.
	* gcc.target/i386/memset-strategy-25.c: Likewise.
	* gcc.target/i386/memset-strategy-29.c: Likewise.
	* gcc.target/i386/memset-strategy-30.c: Likewise.
	* gcc.target/i386/memset-strategy-31.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-1.c: New test.
	* gcc.target/i386/memcpy-pr120683-2.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-3.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-4.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-5.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-6.c: Likewise.
	* gcc.target/i386/memcpy-pr120683-7.c: Likewise.
	* gcc.target/i386/memset-pr120683-1.c: Likewise.
	* gcc.target/i386/memset-pr120683-2.c: Likewise.
	* gcc.target/i386/memset-pr120683-3.c: Likewise.
	* gcc.target/i386/memset-pr120683-4.c: Likewise.
	* gcc.target/i386/memset-pr120683-5.c: Likewise.
	* gcc.target/i386/memset-pr120683-6.c: Likewise.
	* gcc.target/i386/memset-pr120683-7.c: Likewise.
	* gcc.target/i386/memset-pr120683-8.c: Likewise.
	* gcc.target/i386/memset-pr120683-9.c: Likewise.
	* gcc.target/i386/memset-pr120683-10.c: Likewise.
	* gcc.target/i386/memset-pr120683-11.c: Likewise.
	* gcc.target/i386/memset-pr120683-12.c: Likewise.
	* gcc.target/i386/memset-pr120683-13.c: Likewise.
	* gcc.target/i386/memset-pr120683-14.c: Likewise.
	* gcc.target/i386/memset-pr120683-15.c: Likewise.
	* gcc.target/i386/memset-pr120683-16.c: Likewise.
	* gcc.target/i386/memset-pr120683-17.c: Likewise.
	* gcc.target/i386/memset-pr120683-18.c: Likewise.
	* gcc.target/i386/memset-pr120683-19.c: Likewise.
	* gcc.target/i386/memset-pr120683-20.c: Likewise.
	* gcc.target/i386/memset-pr120683-21.c: Likewise.
	* gcc.target/i386/memset-pr120683-22.c: Likewise.
	* gcc.target/i386/memset-pr120683-23.c: Likewise.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/config/i386/i386-expand.cc                | 271 ++++++++++++++----
 .../gcc.target/i386/auto-init-padding-9.c     |  23 +-
 .../gcc.target/i386/memcpy-pr120683-1.c       |  42 +++
 .../gcc.target/i386/memcpy-pr120683-2.c       |  41 +++
 .../gcc.target/i386/memcpy-pr120683-3.c       |  43 +++
 .../gcc.target/i386/memcpy-pr120683-4.c       |  42 +++
 .../gcc.target/i386/memcpy-pr120683-5.c       |  44 +++
 .../gcc.target/i386/memcpy-pr120683-6.c       |  42 +++
 .../gcc.target/i386/memcpy-pr120683-7.c       |  44 +++
 .../gcc.target/i386/memcpy-strategy-12.c      |  12 +-
 .../gcc.target/i386/memset-pr120683-1.c       |  35 +++
 .../gcc.target/i386/memset-pr120683-10.c      |  28 ++
 .../gcc.target/i386/memset-pr120683-11.c      |  29 ++
 .../gcc.target/i386/memset-pr120683-12.c      |  31 ++
 .../gcc.target/i386/memset-pr120683-13.c      |  36 +++
 .../gcc.target/i386/memset-pr120683-14.c      |  91 ++++++
 .../gcc.target/i386/memset-pr120683-15.c      | 103 +++++++
 .../gcc.target/i386/memset-pr120683-16.c      | 112 ++++++++
 .../gcc.target/i386/memset-pr120683-17.c      |  37 +++
 .../gcc.target/i386/memset-pr120683-18.c      |  37 +++
 .../gcc.target/i386/memset-pr120683-19.c      |  37 +++
 .../gcc.target/i386/memset-pr120683-2.c       |  30 ++
 .../gcc.target/i386/memset-pr120683-20.c      |  38 +++
 .../gcc.target/i386/memset-pr120683-21.c      |  38 +++
 .../gcc.target/i386/memset-pr120683-22.c      |  27 ++
 .../gcc.target/i386/memset-pr120683-23.c      |  67 +++++
 .../gcc.target/i386/memset-pr120683-3.c       |  26 ++
 .../gcc.target/i386/memset-pr120683-4.c       |  93 ++++++
 .../gcc.target/i386/memset-pr120683-5.c       | 102 +++++++
 .../gcc.target/i386/memset-pr120683-6.c       | 109 +++++++
 .../gcc.target/i386/memset-pr120683-7.c       |  94 ++++++
 .../gcc.target/i386/memset-pr120683-8.c       | 103 +++++++
 .../gcc.target/i386/memset-pr120683-9.c       | 110 +++++++
 .../gcc.target/i386/memset-strategy-25.c      |   5 +
 .../gcc.target/i386/memset-strategy-29.c      |   5 +
 .../gcc.target/i386/memset-strategy-30.c      |   5 +
 .../gcc.target/i386/memset-strategy-31.c      |   4 +
 37 files changed, 1967 insertions(+), 69 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-18.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-19.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-20.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-21.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-23.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-pr120683-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 423fc632003..edb6eb8eadc 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -7899,7 +7899,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
 			       rtx count, machine_mode mode, int unroll,
 			       int expected_size, bool issetmem)
 {
-  rtx_code_label *out_label, *top_label;
+  rtx_code_label *out_label = nullptr;
+  rtx_code_label *top_label = nullptr;
   rtx iter, tmp;
   machine_mode iter_mode = counter_mode (count);
   int piece_size_n = GET_MODE_SIZE (mode) * unroll;
@@ -7907,9 +7908,19 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
   rtx size;
   int i;
+  int loop_count;
 
-  top_label = gen_label_rtx ();
-  out_label = gen_label_rtx ();
+  if (expected_size != -1 && CONST_INT_P (count))
+    loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
+  else
+    loop_count = -1;
+
+  /* Don't generate the loop if the loop count is 1.  */
+  if (loop_count != 1)
+    {
+      top_label = gen_label_rtx ();
+      out_label = gen_label_rtx ();
+    }
   iter = gen_reg_rtx (iter_mode);
 
   size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
@@ -7923,7 +7934,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
     }
   emit_move_insn (iter, const0_rtx);
 
-  emit_label (top_label);
+  if (loop_count != 1)
+    emit_label (top_label);
 
   tmp = convert_modes (Pmode, iter_mode, iter, true);
 
@@ -7991,21 +8003,25 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
   if (tmp != iter)
     emit_move_insn (iter, tmp);
 
-  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
-			   true, top_label);
-  if (expected_size != -1)
+  if (loop_count != 1)
     {
-      expected_size /= GET_MODE_SIZE (mode) * unroll;
-      if (expected_size == 0)
-	predict_jump (0);
-      else if (expected_size > REG_BR_PROB_BASE)
-	predict_jump (REG_BR_PROB_BASE - 1);
+      emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
+			       true, top_label);
+      if (expected_size != -1)
+	{
+	  expected_size /= GET_MODE_SIZE (mode) * unroll;
+	  if (expected_size == 0)
+	    predict_jump (0);
+	  else if (expected_size > REG_BR_PROB_BASE)
+	    predict_jump (REG_BR_PROB_BASE - 1);
+	  else
+	    predict_jump (REG_BR_PROB_BASE
+			  - (REG_BR_PROB_BASE + expected_size / 2)
+			    / expected_size);
+	}
       else
-        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
-		      / expected_size);
+	predict_jump (REG_BR_PROB_BASE * 80 / 100);
     }
-  else
-    predict_jump (REG_BR_PROB_BASE * 80 / 100);
   iter = ix86_zero_extend_to_Pmode (iter);
   tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
 			     true, OPTAB_LIB_WIDEN);
@@ -8018,7 +8034,8 @@ expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
       if (tmp != srcptr)
 	emit_move_insn (srcptr, tmp);
     }
-  emit_label (out_label);
+  if (loop_count != 1)
+    emit_label (out_label);
 }
 
 /* Divide COUNTREG by SCALE.  */
@@ -8221,19 +8238,11 @@ expand_cpymem_epilogue (rtx destmem, rtx srcmem,
   rtx src, dest;
   if (CONST_INT_P (count))
     {
-      HOST_WIDE_INT countval = INTVAL (count);
-      HOST_WIDE_INT epilogue_size = countval % max_size;
-      int i;
-
-      /* For now MAX_SIZE should be a power of 2.  This assert could be
-	 relaxed, but it'll require a bit more complicated epilogue
-	 expanding.  */
-      gcc_assert ((max_size & (max_size - 1)) == 0);
-      for (i = max_size; i >= 1; i >>= 1)
-	{
-	  if (epilogue_size & i)
-	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
-	}
+      unsigned HOST_WIDE_INT countval = UINTVAL (count);
+      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+      unsigned int destalign = MEM_ALIGN (destmem);
+      move_by_pieces (destmem, srcmem, epilogue_size, destalign,
+		      RETURN_BEGIN);
       return;
     }
   if (max_size > 8)
@@ -8394,6 +8403,81 @@ expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
 				 1, max_size / 2, true);
 }
 
+/* Callback routine for store_by_pieces.  Return the RTL of a register
+   containing GET_MODE_SIZE (MODE) bytes in the RTL register OP_P which
+   is a word or a word vector register.  If PREV_P isn't nullptr, it
+   has the RTL info from the previous iteration.  */
+
+static rtx
+setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
+			 fixed_size_mode mode)
+{
+  rtx target;
+  by_pieces_prev *prev = (by_pieces_prev *) prev_p;
+  if (prev)
+    {
+      rtx prev_op = prev->data;
+      if (prev_op)
+	{
+	  machine_mode prev_mode = GET_MODE (prev_op);
+	  if (prev_mode == mode)
+	    return prev_op;
+	  if (VECTOR_MODE_P (prev_mode)
+	      && VECTOR_MODE_P (mode)
+	      && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
+	    {
+	      target = gen_rtx_SUBREG (mode, prev_op, 0);
+	      return target;
+	    }
+	}
+    }
+
+  rtx op = (rtx) op_p;
+  machine_mode op_mode = GET_MODE (op);
+
+  gcc_assert (op_mode == word_mode
+	      || (VECTOR_MODE_P (op_mode)
+		  && GET_MODE_INNER (op_mode) == word_mode));
+
+  if (VECTOR_MODE_P (mode))
+    {
+      gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+      unsigned int op_size = GET_MODE_SIZE (op_mode);
+      unsigned int size = GET_MODE_SIZE (mode);
+      unsigned int nunits = op_size / GET_MODE_SIZE (QImode);
+      machine_mode vec_mode
+	= mode_for_vector (QImode, nunits).require ();
+      target = gen_reg_rtx (vec_mode);
+      op = gen_rtx_SUBREG (vec_mode, op, 0);
+      emit_move_insn (target, op);
+      if (op_size == size)
+	return target;
+
+      rtx tmp = gen_reg_rtx (mode);
+      target = gen_rtx_SUBREG (mode, target, 0);
+      emit_move_insn (tmp, target);
+      return tmp;
+    }
+
+  target = gen_reg_rtx (word_mode);
+  if (VECTOR_MODE_P (op_mode))
+    {
+      op = gen_rtx_SUBREG (word_mode, op, 0);
+      emit_move_insn (target, op);
+    }
+  else
+    target = op;
+
+  if (mode == word_mode)
+    return target;
+
+  rtx tmp = gen_reg_rtx (mode);
+  target = gen_rtx_SUBREG (mode, target, 0);
+  emit_move_insn (tmp, target);
+  return tmp;
+}
+
 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
 static void
 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
@@ -8403,24 +8487,12 @@ expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
 
   if (CONST_INT_P (count))
     {
-      HOST_WIDE_INT countval = INTVAL (count);
-      HOST_WIDE_INT epilogue_size = countval % max_size;
-      int i;
-
-      /* For now MAX_SIZE should be a power of 2.  This assert could be
-	 relaxed, but it'll require a bit more complicated epilogue
-	 expanding.  */
-      gcc_assert ((max_size & (max_size - 1)) == 0);
-      for (i = max_size; i >= 1; i >>= 1)
-	{
-	  if (epilogue_size & i)
-	    {
-	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
-		destmem = emit_memset (destmem, destptr, vec_value, i);
-	      else
-		destmem = emit_memset (destmem, destptr, value, i);
-	    }
-	}
+      unsigned HOST_WIDE_INT countval = UINTVAL (count);
+      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
+      unsigned int destalign = MEM_ALIGN (destmem);
+      store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
+		       vec_value ? vec_value : value, destalign, true,
+		       RETURN_BEGIN);
       return;
     }
   if (max_size > 32)
@@ -8552,6 +8624,7 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
   rtx_code_label *label = ix86_expand_aligntest (count, size, false);
   machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
   rtx modesize;
+  rtx scalar_value = value;
   int n;
 
   /* If we do not have vector value to copy, we must reduce size.  */
@@ -8571,11 +8644,57 @@ expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
     {
       /* Choose appropriate vector mode.  */
       if (size >= 32)
-	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
+	switch (MOVE_MAX)
+	  {
+	  case 64:
+	    if (size >= 64)
+	      {
+		mode = V64QImode;
+		break;
+	      }
+	    /* FALLTHRU */
+	  case 32:
+	    mode = V32QImode;
+	    break;
+	  case 16:
+	    mode = V16QImode;
+	    break;
+	  case 8:
+	    mode = DImode;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
       else if (size >= 16)
 	mode = TARGET_SSE ? V16QImode : DImode;
       srcmem = change_address (srcmem, mode, srcptr);
     }
+  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
+    {
+      /* For memset with vector where the size is smaller than the vector
+	 size, first try the narrower vector, otherwise use the
+	 original scalar value.  */
+      machine_mode inner_mode = GET_MODE_INNER (mode);
+      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
+      if (nunits > 1)
+	{
+	  mode = mode_for_vector (GET_MODE_INNER (mode),
+				  nunits).require ();
+	  value = gen_rtx_SUBREG (mode, value, 0);
+	}
+      else
+	{
+	  scalar_int_mode smode
+	    = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
+	  gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
+		      >= GET_MODE_SIZE (smode));
+	  mode = smode;
+	  if (GET_MODE (scalar_value) == mode)
+	    value = scalar_value;
+	  else
+	    value = gen_rtx_SUBREG (mode, scalar_value, 0);
+	}
+    }
   destmem = change_address (destmem, mode, destptr);
   modesize = GEN_INT (GET_MODE_SIZE (mode));
   gcc_assert (GET_MODE_SIZE (mode) <= size);
@@ -9179,13 +9298,26 @@ decide_alignment (int align,
 static rtx
 promote_duplicated_reg (machine_mode mode, rtx val)
 {
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+
   machine_mode valmode = GET_MODE (val);
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      /* Duplicate the scalar value for integer vector.  */
+      gcc_assert ((val == const0_rtx || val == constm1_rtx)
+		  || GET_MODE_INNER (mode) == valmode);
+      rtx dup = gen_reg_rtx (mode);
+      bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
+						   val);
+      gcc_assert (ok);
+      return dup;
+    }
+
   rtx tmp;
   int nops = mode == DImode ? 3 : 2;
 
-  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
-  if (val == const0_rtx)
-    return copy_to_mode_reg (mode, CONST0_RTX (mode));
+  gcc_assert (mode == SImode || mode == DImode);
   if (CONST_INT_P (val))
     {
       HOST_WIDE_INT v = INTVAL (val) & 255;
@@ -9413,11 +9545,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
     return false;
   gcc_assert (alg != no_stringop);
 
-  /* For now vector-version of memset is generated only for memory zeroing, as
-     creating of promoted vector value is very cheap in this case.  */
-  if (issetmem && alg == vector_loop && val_exp != const0_rtx)
-    alg = unrolled_loop;
-
   if (!count)
     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
@@ -9510,20 +9637,41 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
        && ((desired_align > align && !align_bytes)
 	   || (!count && epilogue_size_needed > 1)));
 
+  /* Destination is aligned after the misaligned prologue.  */
+  bool aligned_dstmem = misaligned_prologue_used;
+
+  if (noalign && !misaligned_prologue_used)
+    {
+      /* Also use misaligned prologue if alignment isn't needed and the
+	 destination isn't aligned.  Since alignment isn't needed,
+	 the destination after prologue won't be aligned.  */
+      aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
+			<= MEM_ALIGN (dst));
+      if (!aligned_dstmem)
+	misaligned_prologue_used = true;
+    }
+
   /* Do the cheap promotion to allow better CSE across the
      main loop and epilogue (ie one load of the big constant in the
      front of all code.
      For now the misaligned move sequences do not have fast path
      without broadcasting.  */
-  if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
+  if (issetmem
+      && (alg == vector_loop
+	  || CONST_INT_P (val_exp)
+	  || misaligned_prologue_used))
     {
       if (alg == vector_loop)
 	{
-	  gcc_assert (val_exp == const0_rtx);
-	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
 							 GET_MODE_SIZE (word_mode),
 							 desired_align, align);
+	  /* Duplicate the promoted scalar value if it is neither 0 nor -1.  */
+	  vec_promoted_val
+	    = promote_duplicated_reg (move_mode,
+				      (val_exp == const0_rtx
+				       || val_exp == constm1_rtx)
+				      ? val_exp : promoted_val);
 	}
       else
 	{
@@ -9548,7 +9696,8 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
       if (!issetmem)
         src = change_address (src, BLKmode, srcreg);
       dst = change_address (dst, BLKmode, destreg);
-      set_mem_align (dst, desired_align * BITS_PER_UNIT);
+      if (aligned_dstmem)
+	set_mem_align (dst, desired_align * BITS_PER_UNIT);
       epilogue_size_needed = 0;
       if (need_zero_guard
 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
diff --git a/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c b/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
index 102217ce2c1..4f26aa47802 100644
--- a/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
+++ b/gcc/testsuite/gcc.target/i386/auto-init-padding-9.c
@@ -8,17 +8,28 @@
 /*
 **foo:
 **...
+**	leaq	-160\(%rbp\), %rax
+**	movq	%rax, %rcx
 **	pxor	%xmm0, %xmm0
-**...
+**	movl	\$160, %edx
+**	movl	%edx, %edi
+**	andl	\$-64, %edi
+**	movl	\$0, %esi
 **.L[0-9]+:
-**	movl	%esi, %ecx
-**	movaps	%xmm0, \(%rdx,%rcx\)
-**	movaps	%xmm0, 16\(%rdx,%rcx\)
-**	movaps	%xmm0, 32\(%rdx,%rcx\)
-**	movaps	%xmm0, 48\(%rdx,%rcx\)
+**	movl	%esi, %edx
+**	movaps	%xmm0, \(%rax,%rdx\)
+**	movaps	%xmm0, 16\(%rax,%rdx\)
+**	movaps	%xmm0, 32\(%rax,%rdx\)
+**	movaps	%xmm0, 48\(%rax,%rdx\)
 **	addl	\$64, %esi
 **	cmpl	%edi, %esi
 **	jb	.L[0-9]+
+**	movl	%esi, %eax
+**	addq	%rax, %rcx
+**	movaps	%xmm0, \(%rcx\)
+**	movaps	%xmm0, 16\(%rcx\)
+**	movzbl	-116\(%rbp\), %eax
+**	movsbl	%al, %eax
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
new file mode 100644
index 00000000000..753238e35fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemcpy-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movq	221\(%rsi\), %rax
+**	xorl	%edx, %edx
+**	movq	%rax, 221\(%rdi\)
+**	movq	229\(%rsi\), %rax
+**	movq	%rax, 229\(%rdi\)
+**	movq	237\(%rsi\), %rax
+**	movq	%rax, 237\(%rdi\)
+**	movq	245\(%rsi\), %rax
+**	movq	%rax, 245\(%rdi\)
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$32, %edx
+**	movq	\(%rsi,%rax\), %r10
+**	movq	8\(%rsi,%rax\), %r9
+**	movq	16\(%rsi,%rax\), %r8
+**	movq	24\(%rsi,%rax\), %rcx
+**	movq	%r10, \(%rdi,%rax\)
+**	movq	%r9, 8\(%rdi,%rax\)
+**	movq	%r8, 16\(%rdi,%rax\)
+**	movq	%rcx, 24\(%rdi,%rax\)
+**	cmpl	\$224, %edx
+**	jb	.L[0-9]+
+**	ret
+**...
+*/
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 253);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
new file mode 100644
index 00000000000..9b0fb0638ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-2.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$64, %edx
+**	movdqa	src\(%rax\), %xmm3
+**	movdqa	src\+16\(%rax\), %xmm2
+**	movdqa	src\+32\(%rax\), %xmm1
+**	movdqa	src\+48\(%rax\), %xmm0
+**	movaps	%xmm3, dest\(%rax\)
+**	movaps	%xmm2, dest\+16\(%rax\)
+**	movaps	%xmm1, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\+48\(%rax\)
+**	cmpl	\$256, %edx
+**	jb	.L[0-9]+
+**	movdqa	src\(%rdx\), %xmm0
+**	movaps	%xmm0, dest\(%rdx\)
+**	ret
+**...
+*/
+
+#define SIZE (16 + 1) * 16
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE);
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
new file mode 100644
index 00000000000..600459b2a2e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-3.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$64, %edx
+**	movdqa	src\(%rax\), %xmm3
+**	movdqa	src\+16\(%rax\), %xmm2
+**	movdqa	src\+32\(%rax\), %xmm1
+**	movdqa	src\+48\(%rax\), %xmm0
+**	movaps	%xmm3, dest\(%rax\)
+**	movaps	%xmm2, dest\+16\(%rax\)
+**	movaps	%xmm1, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\+48\(%rax\)
+**	cmpl	\$256, %edx
+**	jb	.L[0-9]+
+**	movdqa	src\(%rdx\), %xmm0
+**	movaps	%xmm0, dest\(%rdx\)
+**	movdqu	src\+15\(%rdx\), %xmm0
+**	movups	%xmm0, dest\+15\(%rdx\)
+**	ret
+**...
+*/
+
+#define SIZE (16 * 16 + 31) /* 287 bytes: 256 copied by the loop, then two overlapping 16-byte moves.  */
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE); /* Constant size, global operands.  */
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
new file mode 100644
index 00000000000..14833ff8957
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-4.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	subl	\$-128, %edx
+**	vmovdqa	src\(%rax\), %ymm3
+**	vmovdqa	src\+32\(%rax\), %ymm2
+**	vmovdqa	src\+64\(%rax\), %ymm1
+**	vmovdqa	src\+96\(%rax\), %ymm0
+**	vmovdqa	%ymm3, dest\(%rax\)
+**	vmovdqa	%ymm2, dest\+32\(%rax\)
+**	vmovdqa	%ymm1, dest\+64\(%rax\)
+**	vmovdqa	%ymm0, dest\+96\(%rax\)
+**	cmpl	\$512, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE ((16 + 1) * 32) /* 544 bytes: 512 copied by the 128-byte loop, plus one 32-byte vector.  */
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE); /* Constant size, global operands.  */
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
new file mode 100644
index 00000000000..15ffed9dea1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-5.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	subl	\$-128, %edx
+**	vmovdqa	src\(%rax\), %ymm3
+**	vmovdqa	src\+32\(%rax\), %ymm2
+**	vmovdqa	src\+64\(%rax\), %ymm1
+**	vmovdqa	src\+96\(%rax\), %ymm0
+**	vmovdqa	%ymm3, dest\(%rax\)
+**	vmovdqa	%ymm2, dest\+32\(%rax\)
+**	vmovdqa	%ymm1, dest\+64\(%rax\)
+**	vmovdqa	%ymm0, dest\+96\(%rax\)
+**	cmpl	\$512, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vmovdqu	src\+31\(%rdx\), %ymm0
+**	vmovdqu	%ymm0, dest\+31\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE (16 * 32 + 32 + 31) /* 575 bytes: 512 copied by the loop, then two overlapping 32-byte moves.  */
+
+char dest[SIZE];
+char src[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE); /* Constant size, global operands.  */
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
new file mode 100644
index 00000000000..d57dcc15116
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-6.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$256, %edx
+**	vmovdqa64	src\(%rax\), %zmm3
+**	vmovdqa64	src\+64\(%rax\), %zmm2
+**	vmovdqa64	src\+128\(%rax\), %zmm1
+**	vmovdqa64	src\+192\(%rax\), %zmm0
+**	vmovdqa64	%zmm3, dest\(%rax\)
+**	vmovdqa64	%zmm2, dest\+64\(%rax\)
+**	vmovdqa64	%zmm1, dest\+128\(%rax\)
+**	vmovdqa64	%zmm0, dest\+192\(%rax\)
+**	cmpl	\$1024, %edx
+**	jb	.L[0-9]+
+**	vmovdqa64	src\(%rdx\), %zmm0
+**	vmovdqa64	%zmm0, dest\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE ((16 + 1) * 64) /* 1088 bytes: 1024 copied by the 256-byte loop, plus one 64-byte vector.  */
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE); /* Constant size, 64-byte-aligned global operands.  */
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
new file mode 100644
index 00000000000..d9eb77d26af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-pr120683-7.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemcpy-strategy=vector_loop:2048:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	xorl	%edx, %edx
+**.L[0-9]+:
+**	movl	%edx, %eax
+**	addl	\$256, %edx
+**	vmovdqa64	src\(%rax\), %zmm3
+**	vmovdqa64	src\+64\(%rax\), %zmm2
+**	vmovdqa64	src\+128\(%rax\), %zmm1
+**	vmovdqa64	src\+192\(%rax\), %zmm0
+**	vmovdqa64	%zmm3, dest\(%rax\)
+**	vmovdqa64	%zmm2, dest\+64\(%rax\)
+**	vmovdqa64	%zmm1, dest\+128\(%rax\)
+**	vmovdqa64	%zmm0, dest\+192\(%rax\)
+**	cmpl	\$1024, %edx
+**	jb	.L[0-9]+
+**	vmovdqa	src\(%rdx\), %ymm0
+**	vmovdqa	%ymm0, dest\(%rdx\)
+**	vmovdqu	src\+31\(%rdx\), %ymm0
+**	vmovdqu	%ymm0, dest\+31\(%rdx\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+#define SIZE (16 * 64 + 63) /* 1087 bytes: 1024 copied by the loop, then two overlapping 32-byte moves.  */
+
+char dest[SIZE] __attribute__((aligned(64)));
+char src[SIZE] __attribute__((aligned(64)));
+
+void
+foo (void)
+{
+  __builtin_memcpy (dest, src, SIZE); /* Constant size, 64-byte-aligned global operands.  */
+}
+
+/* { dg-final { scan-assembler-not "rep mov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
index d0316efc8ee..47160864ea7 100644
--- a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -6,9 +6,16 @@
 /*
 **foo:
 **.LFB[0-9]+:
-**...
+**	.cfi_startproc
+**	movq	221\(%rsi\), %rax
 **	xorl	%edx, %edx
-**...
+**	movq	%rax, 221\(%rdi\)
+**	movq	229\(%rsi\), %rax
+**	movq	%rax, 229\(%rdi\)
+**	movq	237\(%rsi\), %rax
+**	movq	%rax, 237\(%rdi\)
+**	movq	245\(%rsi\), %rax
+**	movq	%rax, 245\(%rdi\)
 **.L[0-9]+:
 **	movl	%edx, %eax
 **	addl	\$32, %edx
@@ -22,6 +29,7 @@
 **	movq	%rcx, 24\(%rdi,%rax\)
 **	cmpl	\$224, %edx
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
new file mode 100644
index 00000000000..90e544df7ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-1.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**	movups	%xmm0, 190\(%rdi\)
+**	movups	%xmm0, 206\(%rdi\)
+**	movups	%xmm0, 222\(%rdi\)
+**	movups	%xmm0, 238\(%rdi\)
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movups	%xmm0, \(%rdi,%rdx\)
+**	movups	%xmm0, 16\(%rdi,%rdx\)
+**	movups	%xmm0, 32\(%rdi,%rdx\)
+**	movups	%xmm0, 48\(%rdi,%rdx\)
+**	cmpl	\$192, %eax
+**	jb	.L[0-9]+
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254); /* 254 = 192 (3 loop iterations of 64 bytes) + a 62-byte tail.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
new file mode 100644
index 00000000000..6d3d9e750e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-10.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movq	\$0, 48\(%rdi\)
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, 8\(%rdi\)
+**	movq	\$0, 16\(%rdi\)
+**	movq	\$0, 24\(%rdi\)
+**	movq	\$0, 32\(%rdi\)
+**	movq	\$0, 40\(%rdi\)
+**	movq	\$0, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 61); /* 61 bytes: seven 8-byte stores plus one overlapping store at offset 53.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
new file mode 100644
index 00000000000..30b0cad04e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-11.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	movq	%rax, 48\(%rdi\)
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, 8\(%rdi\)
+**	movq	%rax, 16\(%rdi\)
+**	movq	%rax, 24\(%rdi\)
+**	movq	%rax, 32\(%rdi\)
+**	movq	%rax, 40\(%rdi\)
+**	movq	%rax, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 4, 61); /* Fill byte 4 is expected as the 64-bit constant 0x0404040404040404.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
new file mode 100644
index 00000000000..15987a6451f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-12.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-sse -mmemset-strategy=unrolled_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	movq	%rsi, 48\(%rdi\)
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, 8\(%rdi\)
+**	movq	%rsi, 16\(%rdi\)
+**	movq	%rsi, 24\(%rdi\)
+**	movq	%rsi, 32\(%rdi\)
+**	movq	%rsi, 40\(%rdi\)
+**	movq	%rsi, 53\(%rdi\)
+**	ret
+**...
+*/
+
+void
+foo (char *dest, int c)
+{
+  __builtin_memset (dest, c, 61); /* Run-time fill byte: expected to be zero-extended and multiplied by 0x0101010101010101.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
new file mode 100644
index 00000000000..3da6ca7defd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-13.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$192, %eax
+**	jb	.L[0-9]+
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[240]; /* 240 = 192 (3 loop iterations of 64 bytes) + three 16-byte stores.  */
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
new file mode 100644
index 00000000000..7ec9b3fe1bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-14.c
@@ -0,0 +1,91 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L2
+**	testb	\$32, %sil
+**	jne	.L19
+**	testb	\$16, %sil
+**	jne	.L20
+**	testb	\$8, %sil
+**	jne	.L21
+**	testb	\$4, %sil
+**	jne	.L22
+**	testq	%rsi, %rsi
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rsi\)
+**	movups	%xmm0, -48\(%rdi,%rsi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$64, %rsi
+**	jb	.L1
+**	andq	\$-64, %rsi
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L1
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n); /* n is a run-time value; the expected body compares it against 64 to pick a path.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
new file mode 100644
index 00000000000..e7544057994
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-15.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	cmpq	\$128, %rsi
+**	jnb	.L2
+**	testb	\$64, %sil
+**	jne	.L22
+**	testb	\$32, %sil
+**	jne	.L23
+**	testb	\$16, %sil
+**	jne	.L24
+**	testb	\$8, %sil
+**	jne	.L25
+**	testb	\$4, %sil
+**	jne	.L26
+**	testq	%rsi, %rsi
+**	jne	.L27
+**.L20:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$128, %rsi
+**	jb	.L19
+**	andq	\$-128, %rsi
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L10
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L20
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n); /* n is a run-time value; the expected body compares it against 128 to pick a path.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
new file mode 100644
index 00000000000..c519bf36fb0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-16.c
@@ -0,0 +1,112 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	cmpq	\$256, %rsi
+**	jnb	.L2
+**	testb	\$-128, %sil
+**	jne	.L23
+**	testb	\$64, %sil
+**	jne	.L24
+**	testb	\$32, %sil
+**	jne	.L25
+**	testb	\$16, %sil
+**	jne	.L26
+**	testb	\$8, %sil
+**	jne	.L27
+**	testb	\$4, %sil
+**	jne	.L28
+**	testq	%rsi, %rsi
+**	jne	.L29
+**.L21:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$256, %rsi
+**	jb	.L20
+**	xorb	%sil, %sil
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L11
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L29:
+**	movb	\$0, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L21
+**	xorl	%eax, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movq	\$0, \(%rdi\)
+**	movq	\$0, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movl	\$0, \(%rdi\)
+**	movl	\$0, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 0, n); /* n is a run-time value; the expected body compares it against 256 to pick a path.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
new file mode 100644
index 00000000000..744184c44af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-17.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movq	\$0, dest\+48\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[184]; /* 184 = 128 (2 loop iterations of 64 bytes) + three 16-byte stores + one 8-byte store.  */
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c
new file mode 100644
index 00000000000..32f8981b8d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-18.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movl	\$0, dest\+47\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	ret
+**...
+*/
+
+char dest[179]; /* 179 = 128 (2 loop iterations) + 48 + 3; the last 3 bytes use an overlapping 4-byte store.  */
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c
new file mode 100644
index 00000000000..04f9171698f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-19.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	pxor	%xmm0, %xmm0
+**	xorl	%eax, %eax
+**.L[0-9]+:
+**	movl	%eax, %edx
+**	addl	\$64, %eax
+**	movaps	%xmm0, dest\(%rdx\)
+**	movaps	%xmm0, dest\+16\(%rdx\)
+**	movaps	%xmm0, dest\+32\(%rdx\)
+**	movaps	%xmm0, dest\+48\(%rdx\)
+**	cmpl	\$128, %eax
+**	jb	.L[0-9]+
+**	movb	\$0, dest\+48\(%rax\)
+**	movaps	%xmm0, dest\(%rax\)
+**	movaps	%xmm0, dest\+16\(%rax\)
+**	movaps	%xmm0, dest\+32\(%rax\)
+**	ret
+**...
+*/
+
+char dest[177]; /* 177 = 128 (2 loop iterations) + 48 + 1; the last byte uses a single-byte store.  */
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
new file mode 100644
index 00000000000..f7834c08097
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-2.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	vmovdqu	%ymm0, 192\(%rdi\)
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, 64\(%rdi\)
+**	vmovdqu	%ymm0, 96\(%rdi\)
+**	vmovdqu	%ymm0, 128\(%rdi\)
+**	vmovdqu	%ymm0, 160\(%rdi\)
+**	vmovdqu	%ymm0, 222\(%rdi\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254); /* 254 bytes: eight straight-line 32-byte stores, the last overlapping at offset 222.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c
new file mode 100644
index 00000000000..edece1256e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-20.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movd	%edi, %xmm0
+**	punpcklbw	%xmm0, %xmm0
+**	punpcklwd	%xmm0, %xmm0
+**	pshufd	\$0, %xmm0, %xmm0
+**	movaps	%xmm0, dest\+160\(%rip\)
+**	movaps	%xmm0, dest\(%rip\)
+**	movaps	%xmm0, dest\+16\(%rip\)
+**	movaps	%xmm0, dest\+32\(%rip\)
+**	movaps	%xmm0, dest\+48\(%rip\)
+**	movaps	%xmm0, dest\+64\(%rip\)
+**	movaps	%xmm0, dest\+80\(%rip\)
+**	movaps	%xmm0, dest\+96\(%rip\)
+**	movaps	%xmm0, dest\+112\(%rip\)
+**	movaps	%xmm0, dest\+128\(%rip\)
+**	movaps	%xmm0, dest\+144\(%rip\)
+**	movd	%xmm0, dest\+175\(%rip\)
+**	ret
+**...
+*/
+
+char dest[179]; /* 179 = 11 * 16 + 3; the last 3 bytes use an overlapping 4-byte store at offset 175.  */
+
+void
+foo (int c)
+{
+  __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c
new file mode 100644
index 00000000000..a88e109936e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-21.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movd	%edi, %xmm0
+**	movb	%dil, dest\+176\(%rip\)
+**	punpcklbw	%xmm0, %xmm0
+**	punpcklwd	%xmm0, %xmm0
+**	pshufd	\$0, %xmm0, %xmm0
+**	movaps	%xmm0, dest\(%rip\)
+**	movaps	%xmm0, dest\+16\(%rip\)
+**	movaps	%xmm0, dest\+32\(%rip\)
+**	movaps	%xmm0, dest\+48\(%rip\)
+**	movaps	%xmm0, dest\+64\(%rip\)
+**	movaps	%xmm0, dest\+80\(%rip\)
+**	movaps	%xmm0, dest\+96\(%rip\)
+**	movaps	%xmm0, dest\+112\(%rip\)
+**	movaps	%xmm0, dest\+128\(%rip\)
+**	movaps	%xmm0, dest\+144\(%rip\)
+**	movaps	%xmm0, dest\+160\(%rip\)
+**	ret
+**...
+*/
+
+char dest[177]; /* 177 = 11 * 16 + 1; the last byte uses a single-byte store at offset 176.  */
+
+void
+foo (int c)
+{
+  __builtin_memset (dest, c, sizeof (dest));
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c
new file mode 100644
index 00000000000..f2bd69855d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-22.c
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=rep_8byte:8192:align,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	movl	\$25, %ecx
+**	xorl	%eax, %eax
+**	movl	\$dest, %edi
+**	rep stosq
+**	movl	\$0, \(%rdi\)
+**	ret
+**...
+*/
+
+#define SIZE 204 /* 204 = 25 * 8 + 4: 25 rep stosq iterations plus one 4-byte store.  */
+
+char dest[SIZE];
+
+void
+foo (void)
+{
+  __builtin_memset (dest, 0, sizeof (dest));
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c
new file mode 100644
index 00000000000..784f8dc9919
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-23.c
@@ -0,0 +1,67 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -minline-all-stringops -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movzbl	%dil, %edi
+**	movl	\$p, %eax
+**	movabsq	\$72340172838076673, %rdx
+**	imulq	%rdx, %rdi
+**	movq	%rdi, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L18
+**.L2:
+**	movq	%rsi, %rcx
+**	andl	\$63, %ecx
+**	je	.L1
+**	xorl	%edx, %edx
+**	andl	\$1, %esi
+**	je	.L5
+**	movl	\$1, %edx
+**	movb	%dil, \(%rax\)
+**	cmpq	%rcx, %rdx
+**	jnb	.L19
+**.L5:
+**	movb	%dil, \(%rax,%rdx\)
+**	movb	%dil, 1\(%rax,%rdx\)
+**	addq	\$2, %rdx
+**	cmpq	%rcx, %rdx
+**	jb	.L5
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L18:
+**	movq	%rsi, %rdx
+**	xorl	%eax, %eax
+**	andq	\$-64, %rdx
+**.L3:
+**	movaps	%xmm0, p\(%rax\)
+**	addq	\$64, %rax
+**	movaps	%xmm0, p-48\(%rax\)
+**	movaps	%xmm0, p-32\(%rax\)
+**	movaps	%xmm0, p-16\(%rax\)
+**	cmpq	%rdx, %rax
+**	jb	.L3
+**	addq	\$p, %rax
+**	jmp	.L2
+**.L19:
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+
+#define WRITE_CHUNK 256
+char p[WRITE_CHUNK]; /* Fixed 256-byte destination; fill byte and length are run-time values.  */
+
+void
+foo (int c, __SIZE_TYPE__ nbyte)
+{
+ __builtin_memset (p, c, nbyte);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
new file mode 100644
index 00000000000..621baf7b9fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-3.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB[0-9]+:
+**	.cfi_startproc
+**	vpxor	%xmm0, %xmm0, %xmm0
+**	vmovdqu8	%zmm0, 128\(%rdi\)
+**	vmovdqu8	%zmm0, \(%rdi\)
+**	vmovdqu8	%zmm0, 64\(%rdi\)
+**	vmovdqu8	%zmm0, 190\(%rdi\)
+**	vzeroupper
+**	ret
+**...
+*/
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 254); /* 254 bytes: four straight-line 64-byte stores, the last overlapping at offset 190.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
new file mode 100644
index 00000000000..712404be416
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-4.c
@@ -0,0 +1,93 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	movq	%rax, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rsi
+**	jnb	.L2
+**	testb	\$32, %sil
+**	jne	.L19
+**	testb	\$16, %sil
+**	jne	.L20
+**	testb	\$8, %sil
+**	jne	.L21
+**	testb	\$4, %sil
+**	jne	.L22
+**	testq	%rsi, %rsi
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rsi\)
+**	movups	%xmm0, -48\(%rdi,%rsi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$64, %rsi
+**	jb	.L1
+**	andq	\$-64, %rsi
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L1
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rsi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n); /* Constant fill byte 4, run-time length; short tails are expected to use 0x04040404 and 0x0404.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
new file mode 100644
index 00000000000..f597395b38b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-5.c
@@ -0,0 +1,102 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	vmovq	%rax, %xmm1
+**	vpbroadcastq	%xmm1, %ymm0
+**	cmpq	\$128, %rsi
+**	jnb	.L2
+**	testb	\$64, %sil
+**	jne	.L21
+**	testb	\$32, %sil
+**	jne	.L22
+**	testb	\$16, %sil
+**	jne	.L23
+**	testb	\$8, %sil
+**	jne	.L24
+**	testb	\$4, %sil
+**	jne	.L25
+**	testq	%rsi, %rsi
+**	jne	.L26
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$128, %rsi
+**	jb	.L19
+**	andq	\$-128, %rsi
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L10
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L19
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rsi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	jmp	.L19
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);  /* Constant fill byte 4; length N is a parameter.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
new file mode 100644
index 00000000000..7ba1b742076
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-6.c
@@ -0,0 +1,109 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$289360691352306692, %rax
+**	vpbroadcastq	%rax, %zmm0
+**	cmpq	\$256, %rsi
+**	jnb	.L2
+**	testb	\$-128, %sil
+**	jne	.L22
+**	testb	\$64, %sil
+**	jne	.L23
+**	testb	\$32, %sil
+**	jne	.L24
+**	testb	\$16, %sil
+**	jne	.L25
+**	testb	\$8, %sil
+**	jne	.L26
+**	testb	\$4, %sil
+**	jne	.L27
+**	testq	%rsi, %rsi
+**	jne	.L28
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	subq	\$1, %rsi
+**	cmpq	\$256, %rsi
+**	jb	.L20
+**	xorb	%sil, %sil
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rsi, %rax
+**	jb	.L11
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movb	\$4, \(%rdi\)
+**	testb	\$2, %sil
+**	je	.L20
+**	movl	\$1028, %eax
+**	movw	%ax, -2\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rsi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movq	%rax, \(%rdi\)
+**	movq	%rax, -8\(%rdi,%rsi\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movl	\$67372036, \(%rdi\)
+**	movl	\$67372036, -4\(%rdi,%rsi\)
+**	jmp	.L20
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, 4, n);  /* Constant fill byte 4; length N is a parameter.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
new file mode 100644
index 00000000000..62f61c54ed0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-7.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	movq	%rsi, %xmm0
+**	punpcklqdq	%xmm0, %xmm0
+**	cmpq	\$64, %rdx
+**	jnb	.L2
+**	testb	\$32, %dl
+**	jne	.L19
+**	testb	\$16, %dl
+**	jne	.L20
+**	testb	\$8, %dl
+**	jne	.L21
+**	testb	\$4, %dl
+**	jne	.L22
+**	testq	%rdx, %rdx
+**	jne	.L23
+**.L1:
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	movups	%xmm0, -64\(%rdi,%rdx\)
+**	movups	%xmm0, -48\(%rdi,%rdx\)
+**	movups	%xmm0, -32\(%rdi,%rdx\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$64, %rdx
+**	jb	.L1
+**	andq	\$-64, %rdx
+**	xorl	%eax, %eax
+**.L9:
+**	movups	%xmm0, \(%rdi,%rax\)
+**	movups	%xmm0, 16\(%rdi,%rax\)
+**	movups	%xmm0, 32\(%rdi,%rax\)
+**	movups	%xmm0, 48\(%rdi,%rax\)
+**	addq	\$64, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L9
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L1
+**	movw	%si, -2\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L19:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, 16\(%rdi\)
+**	movups	%xmm0, -32\(%rdi,%rdx\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L20:
+**	movups	%xmm0, \(%rdi\)
+**	movups	%xmm0, -16\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	ret
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);  /* Fill value C and length N are both parameters.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
new file mode 100644
index 00000000000..d12ab157494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-8.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	vmovq	%rsi, %xmm1
+**	vpbroadcastq	%xmm1, %ymm0
+**	cmpq	\$128, %rdx
+**	jnb	.L2
+**	testb	\$64, %dl
+**	jne	.L21
+**	testb	\$32, %dl
+**	jne	.L22
+**	testb	\$16, %dl
+**	jne	.L23
+**	testb	\$8, %dl
+**	jne	.L24
+**	testb	\$4, %dl
+**	jne	.L25
+**	testq	%rdx, %rdx
+**	jne	.L26
+**.L19:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu	%ymm0, -128\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -96\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$128, %rdx
+**	jb	.L19
+**	andq	\$-128, %rdx
+**	xorl	%eax, %eax
+**.L10:
+**	vmovdqu	%ymm0, \(%rdi,%rax\)
+**	vmovdqu	%ymm0, 32\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 64\(%rdi,%rax\)
+**	vmovdqu	%ymm0, 96\(%rdi,%rax\)
+**	subq	\$-128, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L10
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L19
+**	movw	%si, -2\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L21:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, 32\(%rdi\)
+**	vmovdqu	%ymm0, -64\(%rdi,%rdx\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	jmp	.L19
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	jmp	.L19
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);  /* Fill value C and length N are both parameters.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
new file mode 100644
index 00000000000..1a0abe6614f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-pr120683-9.c
@@ -0,0 +1,110 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -mmemset-strategy=vector_loop:256:noalign,libcall:-1:noalign -minline-all-stringops" } */
+/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } {^\t?\.} } } */
+
+/*
+**foo:
+**.LFB0:
+**	.cfi_startproc
+**	movabsq	\$72340172838076673, %rax
+**	movzbl	%sil, %esi
+**	imulq	%rax, %rsi
+**	vpbroadcastq	%rsi, %zmm0
+**	cmpq	\$256, %rdx
+**	jnb	.L2
+**	testb	\$-128, %dl
+**	jne	.L22
+**	testb	\$64, %dl
+**	jne	.L23
+**	testb	\$32, %dl
+**	jne	.L24
+**	testb	\$16, %dl
+**	jne	.L25
+**	testb	\$8, %dl
+**	jne	.L26
+**	testb	\$4, %dl
+**	jne	.L27
+**	testq	%rdx, %rdx
+**	jne	.L28
+**.L20:
+**	vzeroupper
+**	ret
+**	.p2align 4,,10
+**	.p2align 3
+**.L2:
+**	vmovdqu64	%zmm0, -256\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -192\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	subq	\$1, %rdx
+**	cmpq	\$256, %rdx
+**	jb	.L20
+**	xorb	%dl, %dl
+**	xorl	%eax, %eax
+**.L11:
+**	vmovdqu64	%zmm0, \(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 64\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 128\(%rdi,%rax\)
+**	vmovdqu64	%zmm0, 192\(%rdi,%rax\)
+**	addq	\$256, %rax
+**	cmpq	%rdx, %rax
+**	jb	.L11
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L28:
+**	movb	%sil, \(%rdi\)
+**	testb	\$2, %dl
+**	je	.L20
+**	movw	%si, -2\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L22:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, 64\(%rdi\)
+**	vmovdqu64	%zmm0, -128\(%rdi,%rdx\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L23:
+**	vmovdqu64	%zmm0, \(%rdi\)
+**	vmovdqu64	%zmm0, -64\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L24:
+**	vmovdqu	%ymm0, \(%rdi\)
+**	vmovdqu	%ymm0, -32\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L25:
+**	vmovdqu	%xmm0, \(%rdi\)
+**	vmovdqu	%xmm0, -16\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L26:
+**	movq	%rsi, \(%rdi\)
+**	movq	%rsi, -8\(%rdi,%rdx\)
+**	jmp	.L20
+**	.p2align 4,,10
+**	.p2align 3
+**.L27:
+**	movl	%esi, \(%rdi\)
+**	movl	%esi, -4\(%rdi,%rdx\)
+**	jmp	.L20
+**	.cfi_endproc
+**...
+*/
+
+void
+foo (char *dest, int c, __SIZE_TYPE__ n)
+{
+  __builtin_memset (dest, c, n);  /* Fill value C and length N are both parameters.  */
+}
+
+/* { dg-final { scan-assembler-not "rep stos" } } */
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-25.c b/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
index 040439d1671..412d69327a1 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-25.c
@@ -7,7 +7,11 @@
 **foo:
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 221\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 229\(%rdi\)
+**	movq	\$0, 237\(%rdi\)
+**	movq	\$0, 245\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -17,6 +21,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$224, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-29.c b/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
index 50470eaba6d..4c84d42438f 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-29.c
@@ -8,7 +8,11 @@
 **...
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 49\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 57\(%rdi\)
+**	movq	\$0, 65\(%rdi\)
+**	movq	\$0, 73\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -18,6 +22,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$64, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-30.c b/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
index ef32a9ce4cd..1648a87aea0 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-30.c
@@ -8,7 +8,11 @@
 **...
 **.LFB[0-9]+:
 **	.cfi_startproc
+**	movq	\$0, 63\(%rdi\)
 **	xorl	%eax, %eax
+**	movq	\$0, 71\(%rdi\)
+**	movq	\$0, 79\(%rdi\)
+**	movq	\$0, 87\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$32, %eax
@@ -18,6 +22,7 @@
 **	movq	\$0, 24\(%rdi,%rdx\)
 **	cmpl	\$64, %eax
 **	jb	.L[0-9]+
+**	ret
 **...
 */
 
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-31.c b/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
index 17a4df25bb2..4791c4dd17c 100644
--- a/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-31.c
@@ -9,6 +9,10 @@
 **...
 **	pxor	%xmm0, %xmm0
 **	xorl	%eax, %eax
+**	movups	%xmm0, 190\(%rdi\)
+**	movups	%xmm0, 206\(%rdi\)
+**	movups	%xmm0, 222\(%rdi\)
+**	movups	%xmm0, 238\(%rdi\)
 **.L[0-9]+:
 **	movl	%eax, %edx
 **	addl	\$64, %eax
-- 
2.49.0

Reply via email to