The maximum size of the current op_by_pieces operations is limited by
MAX_FIXED_MODE_SIZE which is an integer expression for the size in bits
of the largest integer machine mode that should actually be used. But
a target can use TImode/OImode/XImode, which can be larger than
MAX_FIXED_MODE_SIZE, to perform op_by_pieces operations. Here is a
set of patches to remove this limitation so that TImode/OImode/XImode
can be used for piecewise move and store:
1. Remove MAX_FIXED_MODE_SIZE limit in alignment_for_piecewise_move.
2. Allow generating pseudo register with specific alignment for hard
registers which will never be spilled onto stack to avoid re-aligning
stack.
3. Add TARGET_READ_MEMSET_VALUE and TARGET_GEN_MEMSET_VALUE to support
target instructions to duplicate QImode value to TImode/OImode/XImode
value for memset.
4. x86: Avoid stack realignment when copying data
5. x86: Remove MAX_BITSIZE_MODE_ANY_INT. Only x86 backend defines it.
6. x86: Use TImode/OImode/XImode integers for piecewise move and store.
7. x86: Add tests for TImode/OImode/XImode for piecewise move and store.
8. x86: Adjust existing tests.
On x86-64, SPEC CPU 2017 performance impact is neutral. Glibc code size
differences with -O2 build are:
            Before    After
libc.so    1870718  1870222
ld.so       185120   184984
Some code sequence differences in libc.so are:
Before                                After
mov 0x10(%rsp),%edx                   mov 0x10(%rsp),%edx
mov %edx,(%rax)                       mov %edx,(%rax)
movzwl 0x14(%rsp),%edx              | mov 0x13(%rsp),%edx
mov %dx,0x4(%rax)                   | mov %edx,0x3(%rax)
movzbl 0x16(%rsp),%edx              <
mov %dl,0x6(%rax)                   <
add %rcx,%rax                         add %rcx,%rax
ret                                   ret

movdqu (%rsi),%xmm1                 | movdqu (%rcx),%xmm1
mov %rdi,0x20(%rsp)                   mov %rdi,0x20(%rsp)
movups %xmm1,(%rax)                   movups %xmm1,(%rax)
mov 0x10(%rsi),%rdx                 | movdqu 0xc(%rcx),%xmm2
mov %rdx,0x10(%rax)                 | movups %xmm2,0xc(%rax)
mov 0x18(%rsi),%edx                 | mov %rax,(%r14,%rdx,8)
mov %edx,0x18(%rax)                 | add $0x1,%rdx
mov %rax,(%r14,%rcx,8)              | cmp %r8,%rdx
add $0x1,%rcx                       | je <__resolv_conf_allocate+0x22d>
cmp %r8,%rcx                        | mov 0x20(%rsp),%rsi
je <__resolv_conf_allocate+0x22f>   | mov (%r9,%rdx,8),%rcx

test %eax,%eax                        test %eax,%eax
mov $0xff,%eax                        mov $0xff,%eax
cmove %eax,%ebx                       cmove %eax,%ebx
movzbl %bl,%ecx                     | movd %ebx,%xmm0
mov %ebx,0xc(%rsp)                    mov %ebx,0xc(%rsp)
mov %rcx,%rax                       | punpcklbw %xmm0,%xmm0
mov %rcx,%rsi                       | punpcklwd %xmm0,%xmm0
mul %rdi                            | pshufd $0x0,%xmm0,%xmm0
imul %rdi,%rsi                      | movups %xmm0,0x50(%r12)
mov %rax,0x50(%r12)                 | movups %xmm0,0x60(%r12)
mov %rcx,%rax                       | movups %xmm0,0x70(%r12)
add %rdx,%rsi                       | movups %xmm0,0x80(%r12)
mul %rdi                            | movups %xmm0,0x90(%r12)
mov %rsi,0x58(%r12)                 | movups %xmm0,0xa0(%r12)
mov %rsi,0x68(%r12)                 | movups %xmm0,0xb0(%r12)
mov %rax,0x60(%r12)                 | movups %xmm0,0xc0(%r12)
mov %rcx,%rax                       | movups %xmm0,0xd0(%r12)
mul %rdi                            | movups %xmm0,0xe0(%r12)
mov %rsi,0x78(%r12)                 | movups %xmm0,0xf0(%r12)
mov %rsi,0x88(%r12)                 | movups %xmm0,0x100(%r12)
mov %rsi,0x98(%r12)                 | movups %xmm0,0x110(%r12)
mov %rax,0x70(%r12)                 | movups %xmm0,0x120(%r12)
mov %rcx,%rax                       | movups %xmm0,0x130(%r12)
mul %rdi                            | movups %xmm0,0x140(%r12)
mov %rsi,0xa8(%r12)                 <
mov %rsi,0xb8(%r12)                 <
mov %rsi,0xc8(%r12)                 <
mov %rax,0x80(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rsi,0xd8(%r12)                 <
mov %rsi,0xe8(%r12)                 <
mov %rsi,0xf8(%r12)                 <
mov %rax,0x90(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rsi,0x108(%r12)                <
mov %rsi,0x118(%r12)                <
mov %rsi,0x128(%r12)                <
mov %rax,0xa0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rsi,0x138(%r12)                <
mov %rax,0xb0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0xc0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0xd0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0xe0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0xf0(%r12)                 <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0x100(%r12)                <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0x110(%r12)                <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0x120(%r12)                <
mov %rcx,%rax                       <
mul %rdi                            <
mov %rax,0x130(%r12)                <
mov %rcx,%rax                       <
mul %rdi                            <
mov %r12,%rdi                       <
mov %rax,0x140(%r12)                <
mov %rsi,0x148(%r12)                <
call <xprt_register@GLIBC_2.2.5>      call <xprt_register@GLIBC_2.2.5>
add $0x28,%rsp                        add $0x28,%rsp
mov %r12,%rax                         mov %r12,%rax
pop %rbx                              pop %rbx
pop %rbp                              pop %rbp
pop %r12                              pop %r12
pop %r13                              pop %r13
pop %r14                              pop %r14
pop %r15                              pop %r15
ret                                   ret
H.J. Lu (12):
Update alignment_for_piecewise_move
Allow generating pseudo register with specific alignment
Add TARGET_READ_MEMSET_VALUE/TARGET_GEN_MEMSET_VALUE
x86: Avoid stack realignment when copying data
Remove MAX_BITSIZE_MODE_ANY_INT
x86: Update piecewise move and store
x86: Add AVX2 tests for PR middle-end/90773
x86: Add tests for piecewise move and store
x86: Also pass -mno-avx to pr72839.c
x86: Also pass -mno-avx to cold-attribute-1.c
x86: Also pass -mno-avx to sw-1.c for ia32
x86: Update gcc.target/i386/incoming-11.c
gcc/builtins.c | 45 +--
gcc/config/i386/i386-expand.c | 9 +-
gcc/config/i386/i386-modes.def | 15 +-
gcc/config/i386/i386-protos.h | 2 +
gcc/config/i386/i386.c | 257 +++++++++++++++++-
gcc/config/i386/i386.h | 31 ++-
gcc/doc/tm.texi | 16 ++
gcc/doc/tm.texi.in | 4 +
gcc/emit-rtl.c | 5 +-
gcc/explow.c | 6 +-
gcc/explow.h | 2 +-
gcc/expr.c | 13 +-
gcc/expr.h | 6 +-
gcc/rtl.h | 2 +-
gcc/target.def | 20 ++
gcc/targhooks.c | 54 ++++
gcc/targhooks.h | 4 +
.../gcc.target/i386/cold-attribute-1.c | 2 +-
gcc/testsuite/gcc.target/i386/incoming-11.c | 2 +-
.../gcc.target/i386/pieces-memcpy-10.c | 16 ++
.../gcc.target/i386/pieces-memcpy-11.c | 17 ++
.../gcc.target/i386/pieces-memcpy-12.c | 16 ++
.../gcc.target/i386/pieces-memcpy-13.c | 16 ++
.../gcc.target/i386/pieces-memcpy-14.c | 17 ++
.../gcc.target/i386/pieces-memcpy-15.c | 16 ++
.../gcc.target/i386/pieces-memcpy-16.c | 16 ++
.../gcc.target/i386/pieces-memcpy-7.c | 15 +
.../gcc.target/i386/pieces-memcpy-8.c | 14 +
.../gcc.target/i386/pieces-memcpy-9.c | 14 +
.../gcc.target/i386/pieces-memset-1.c | 16 ++
.../gcc.target/i386/pieces-memset-10.c | 16 ++
.../gcc.target/i386/pieces-memset-11.c | 16 ++
.../gcc.target/i386/pieces-memset-12.c | 16 ++
.../gcc.target/i386/pieces-memset-13.c | 16 ++
.../gcc.target/i386/pieces-memset-14.c | 16 ++
.../gcc.target/i386/pieces-memset-15.c | 16 ++
.../gcc.target/i386/pieces-memset-16.c | 16 ++
.../gcc.target/i386/pieces-memset-17.c | 16 ++
.../gcc.target/i386/pieces-memset-18.c | 16 ++
.../gcc.target/i386/pieces-memset-19.c | 17 ++
.../gcc.target/i386/pieces-memset-2.c | 12 +
.../gcc.target/i386/pieces-memset-20.c | 17 ++
.../gcc.target/i386/pieces-memset-21.c | 17 ++
.../gcc.target/i386/pieces-memset-22.c | 17 ++
.../gcc.target/i386/pieces-memset-23.c | 17 ++
.../gcc.target/i386/pieces-memset-24.c | 17 ++
.../gcc.target/i386/pieces-memset-25.c | 17 ++
.../gcc.target/i386/pieces-memset-26.c | 17 ++
.../gcc.target/i386/pieces-memset-27.c | 17 ++
.../gcc.target/i386/pieces-memset-28.c | 17 ++
.../gcc.target/i386/pieces-memset-29.c | 17 ++
.../gcc.target/i386/pieces-memset-3.c | 18 ++
.../gcc.target/i386/pieces-memset-30.c | 17 ++
.../gcc.target/i386/pieces-memset-31.c | 17 ++
.../gcc.target/i386/pieces-memset-32.c | 17 ++
.../gcc.target/i386/pieces-memset-33.c | 17 ++
.../gcc.target/i386/pieces-memset-34.c | 17 ++
.../gcc.target/i386/pieces-memset-35.c | 17 ++
.../gcc.target/i386/pieces-memset-36.c | 17 ++
.../gcc.target/i386/pieces-memset-37.c | 15 +
.../gcc.target/i386/pieces-memset-38.c | 17 ++
.../gcc.target/i386/pieces-memset-39.c | 16 ++
.../gcc.target/i386/pieces-memset-4.c | 16 ++
.../gcc.target/i386/pieces-memset-40.c | 17 ++
.../gcc.target/i386/pieces-memset-41.c | 16 ++
.../gcc.target/i386/pieces-memset-42.c | 17 ++
.../gcc.target/i386/pieces-memset-43.c | 17 ++
.../gcc.target/i386/pieces-memset-5.c | 12 +
.../gcc.target/i386/pieces-memset-6.c | 16 ++
.../gcc.target/i386/pieces-memset-7.c | 16 ++
.../gcc.target/i386/pieces-memset-8.c | 16 ++
.../gcc.target/i386/pieces-memset-9.c | 16 ++
gcc/testsuite/gcc.target/i386/pr72839.c | 2 +-
gcc/testsuite/gcc.target/i386/pr90773-1.c | 10 +-
gcc/testsuite/gcc.target/i386/pr90773-14.c | 2 +-
gcc/testsuite/gcc.target/i386/pr90773-15.c | 14 +
gcc/testsuite/gcc.target/i386/pr90773-16.c | 14 +
gcc/testsuite/gcc.target/i386/pr90773-17.c | 14 +
gcc/testsuite/gcc.target/i386/pr90773-18.c | 15 +
gcc/testsuite/gcc.target/i386/pr90773-19.c | 14 +
gcc/testsuite/gcc.target/i386/pr90773-20.c | 13 +
gcc/testsuite/gcc.target/i386/pr90773-21.c | 13 +
gcc/testsuite/gcc.target/i386/pr90773-22.c | 13 +
gcc/testsuite/gcc.target/i386/pr90773-23.c | 13 +
gcc/testsuite/gcc.target/i386/pr90773-4.c | 2 +-
gcc/testsuite/gcc.target/i386/sw-1.c | 1 +
86 files changed, 1404 insertions(+), 91 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-11.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-14.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-9.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-10.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-11.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-12.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-13.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-14.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-17.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-18.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-19.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-20.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-21.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-22.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-23.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-24.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-25.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-26.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-27.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-28.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-29.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-30.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-31.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-32.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-33.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-34.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-35.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-36.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-37.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-38.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-39.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-40.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-41.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-42.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-43.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-6.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-7.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-8.c
create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-9.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-15.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-16.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-17.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-18.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-19.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-20.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-21.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-22.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr90773-23.c
--
2.31.1