On Fri, Apr 30, 2021 at 04:56:30PM +0100, Richard Sandiford wrote:
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Apr 30, 2021 at 5:49 AM H.J. Lu <hjl.to...@gmail.com> wrote:
> >>
> >> On Fri, Apr 30, 2021 at 5:42 AM Richard Sandiford
> >> <richard.sandif...@arm.com> wrote:
> >> >
> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > > On Fri, Apr 30, 2021 at 2:06 AM Richard Sandiford
> >> > > <richard.sandif...@arm.com> wrote:
> >> > >>
> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > >> > gen_reg_rtx tracks stack alignment needed for pseudo registers so that
> >> > >> > associated hard registers can be properly spilled onto stack.  But there
> >> > >> > are cases where associated hard registers will never be spilled onto
> >> > >> > stack.  gen_reg_rtx is changed to take an argument for register alignment
> >> > >> > so that stack realignment can be avoided when not needed.
> >> > >>
> >> > >> How is it guaranteed that they will never be spilled though?
> >> > >> I don't think that that guarantee exists for any kind of pseudo,
> >> > >> except perhaps for the temporary pseudos that the RA creates to
> >> > >> replace (match_scratch …)es.
> >> > >>
> >> > >
> >> > > The caller creating pseudo registers with a specific alignment must
> >> > > guarantee that they will never be spilled.   I am only using it in
> >> > >
> >> > >   /* Make operand1 a register if it isn't already.  */
> >> > >   if (can_create_pseudo_p ()
> >> > >       && !register_operand (op0, mode)
> >> > >       && !register_operand (op1, mode))
> >> > >     {
> >> > >       /* NB: Don't increase stack alignment requirement when forcing
> >> > >          operand1 into a pseudo register to copy data from one memory
> >> > >          location to another since it doesn't require a spill.  */
> >> > >       emit_move_insn (op0,
> >> > >                       force_reg (GET_MODE (op0), op1,
> >> > >                                  (UNITS_PER_WORD * BITS_PER_UNIT)));
> >> > >       return;
> >> > >     }
> >> > >
> >> > > for vector moves.  RA shouldn't spill it.
> >> >
> >> > But this is the point: it's a case of hoping that the RA won't spill it,
> >> > rather than having a guarantee that it won't.
> >> >
> >> > Even if the moves start out adjacent, they could be separated by later
> >> > RTL optimisations, particularly scheduling.  (I realise pre-RA scheduling
> >> > isn't enabled by default for x86, but it can still be enabled explicitly.)
> >> > Or if the same data is being copied to two locations, we might reuse
> >> > values loaded by the first copy for the second copy as well.
> >
> > There are cases where pseudo vector registers are created as pure
> > temporary registers in the backend and they shouldn't ever be spilled
> > to the stack.  They will be spilled to the stack only if there is other
> > non-temporary vector register usage, in which case the stack will be
> > properly re-aligned.  The caller creating pseudo registers with a
> > specific alignment guarantees that they are used only as pure temporary
> > registers.
> 
> I don't think there's really a distinct category of pure temporary
> registers though.  The things I mentioned above can happen for any
> kind of pseudo register.
> 

This special pseudo register is only generated when inlining memcpy and
memset.  For memcpy, there is no need to spill:

[hjl@gnu-cfl-2 pieces]$ cat spill1.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (void)
{
  __builtin_memcpy (ops1, ops2, 32);
  bar ();
  __builtin_memcpy (ops1, ops2, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill1.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc 
-B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 
-march=haswell -S spill1.i
[hjl@gnu-cfl-2 pieces]$ cat spill1.s
        .file   "spill1.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
foo:
.LFB0:
        .cfi_startproc
        subq    $8, %rsp
        .cfi_def_cfa_offset 16
        movq    ops2(%rip), %rax
        vmovdqu (%rax), %ymm0
        movq    ops1(%rip), %rax
        vmovdqu %ymm0, (%rax)
        vzeroupper
        call    bar
        movq    ops2(%rip), %rax
        vmovdqu (%rax), %ymm0
        movq    ops1(%rip), %rax
        vmovdqu %ymm0, (%rax)
        vzeroupper
        addq    $8, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE0:
        .size   foo, .-foo
        .ident  "GCC: (GNU) 12.0.0 20210430 (experimental)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$
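
In the memcpy output above, the vector value is reloaded from memory for
the second copy rather than being kept live across the call to bar, so no
spill is needed.  Just as a hypothetical illustration of the case Richard
describes (not one of the tests above, and not compiled here), copying the
same source to two destinations with no call in between, where the RTL
optimizers might reuse the value loaded for the first copy, would look
like:

extern void *src;
extern void *dst1;
extern void *dst2;

void
foo (void)
{
  __builtin_memcpy (dst1, src, 32);
  __builtin_memcpy (dst2, src, 32);
}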

For memset, the x86 backend supports an unaligned spill:

[hjl@gnu-cfl-2 pieces]$ cat spill2.i
extern void *ops1;
extern void *ops2;

extern void bar (void);

void
foo (int c)
{
  __builtin_memset (ops1, c, 32);
  bar ();
  __builtin_memset (ops2, c, 32);
}
[hjl@gnu-cfl-2 pieces]$ make spill2.s
/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc 
-B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/ -O2 
-march=haswell -S spill2.i
[hjl@gnu-cfl-2 pieces]$ cat spill2.s
        .file   "spill2.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
foo:
.LFB0:
        .cfi_startproc
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        vmovd   %edi, %xmm0
        movq    ops1(%rip), %rax
        vpbroadcastb    %xmm0, %ymm0
        vmovdqu %ymm0, (%rax)
        vmovdqu %ymm0, (%rsp)
        vzeroupper
        call    bar
        movq    ops2(%rip), %rax
        vmovdqu (%rsp), %ymm0
        vmovdqu %ymm0, (%rax)
        vzeroupper
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE0:
        .size   foo, .-foo
        .ident  "GCC: (GNU) 12.0.0 20210430 (experimental)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-2 pieces]$
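
For reference, the "other non-temporary vector register usage" case
mentioned above would be something like the following hypothetical test
(not compiled here), where a 32-byte vector local with its address taken
already requires a 32-byte-aligned stack slot regardless of the temporary
pseudo:

typedef int v8si __attribute__ ((vector_size (32)));

extern void *ops1;
extern void bar (v8si *);

void
foo (int c)
{
  /* A 32-byte vector local whose address escapes needs a 32-byte-aligned
     stack slot of its own.  */
  v8si tmp = { c, c, c, c, c, c, c, c };
  bar (&tmp);
  __builtin_memset (ops1, c, 32);
}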


H.J.
