Hi, Uros, Could you please check the change compared to the previous version for i386.c as following: Let me know any issue there.
Thanks a lot. Qing --- gcc/config/i386/i386.c | 136 ++++++++++++++++++--- .../gcc.target/i386/zero-scratch-regs-28.c | 17 +++ .../gcc.target/i386/zero-scratch-regs-29.c | 11 ++ .../gcc.target/i386/zero-scratch-regs-30.c | 11 ++ 4 files changed, 155 insertions(+), 20 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index e66dcf0d587..65f778112d9 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno) /* Check whether the register REGNO should be zeroed on X86. When ALL_SSE_ZEROED is true, all SSE registers have been zeroed together, no need to zero it again. - Stack registers (st0-st7) and mm0-mm7 are aliased with each other. - very hard to be zeroed individually, don't zero individual st or - mm registgers. */ + When NEED_ZERO_MMX is true, MMX registers should be cleared. */ static bool zero_call_used_regno_p (const unsigned int regno, - bool all_sse_zeroed) + bool all_sse_zeroed, + bool need_zero_mmx) { return GENERAL_REGNO_P (regno) || (!all_sse_zeroed && SSE_REGNO_P (regno)) - || MASK_REGNO_P (regno); + || MASK_REGNO_P (regno) + || (need_zero_mmx && MMX_REGNO_P (regno)); } /* Return the machine_mode that is used to zero register REGNO. */ @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno) return SImode; else if (SSE_REGNO_P (regno)) return V4SFmode; - else + else if (MASK_REGNO_P (regno)) return HImode; + else if (MMX_REGNO_P (regno)) + return DImode; + else + gcc_unreachable (); } /* Generate a rtx to zero all vector registers together if possible, @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs) return gen_avx_vzeroall (); } -/* Generate insns to zero all st/mm registers together. +/* Generate insns to zero all st registers together. Return true when zeroing instructions are generated. Assume the number of st registers that are zeroed is num_of_st, we will emit the following sequence to zero them together: @@ -3616,23 +3620,50 @@ zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs) ... fstp %%st(0); i.e., num_of_st fldz followed by num_of_st fstp to clear the stack - mark stack slots empty. */ + mark stack slots empty. + + How to compute the num_of_st? + There is no direct mapping from stack registers to hard register + numbers. If one stack register need to be cleared, we don't know + where in the stack the value remains. So, if any stack register + need to be cleared, the whole stack should be cleared. However, + x87 stack registers that hold the return value should be excluded. + x87 returns in the top (two for complex values) register, so + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */ + static bool -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) { unsigned int num_of_st = 0; for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (STACK_REGNO_P (regno) - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno) - /* When the corresponding mm register also need to be cleared too. */ - && TEST_HARD_REG_BIT (need_zeroed_hardregs, - (regno - FIRST_STACK_REG + FIRST_MMX_REG))) - num_of_st++; + if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno)) + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) + { + num_of_st++; + break; + } if (num_of_st == 0) return false; + bool return_with_x87 = false; + return_with_x87 = (crtl->return_rtx + && (GET_CODE (crtl->return_rtx) == REG) + && (STACK_REG_P (crtl->return_rtx))); + + bool complex_return = false; + complex_return = (crtl->return_rtx + && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx))); + + if (return_with_x87) + if (complex_return) + num_of_st = 6; + else + num_of_st = 7; + else + num_of_st = 8; + rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG); for (unsigned int i = 0; i < num_of_st; i++) emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode))); @@ -3646,6 +3677,43 @@ zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs) return true; } + +/* When the routine exit with MMX mode, if there is any ST registers + need to be zeroed, we should clear all MMX registers except the + one that holds the return value RET_MMX_REGNO. */ +static bool +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs, + unsigned int ret_mmx_regno) +{ + bool need_zero_all_mm = false; + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (STACK_REGNO_P (regno) + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) + { + need_zero_all_mm = true; + break; + } + + if (!need_zero_all_mm) + return false; + + rtx zero_mmx = NULL_RTX; + machine_mode mode = DImode; + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++) + if (regno != ret_mmx_regno) + { + rtx reg = gen_rtx_REG (mode, regno); + if (zero_mmx == NULL_RTX) + { + zero_mmx = reg; + emit_insn (gen_rtx_SET (reg, const0_rtx)); + } + else + emit_move_insn (reg, zero_mmx); + } + return true; +} + /* TARGET_ZERO_CALL_USED_REGS. */ /* Generate a sequence of instructions that zero registers specified by NEED_ZEROED_HARDREGS. Return the ZEROED_HARDREGS that are actually @@ -3655,7 +3723,8 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) { HARD_REG_SET zeroed_hardregs; bool all_sse_zeroed = false; - bool st_zeroed = false; + bool all_st_zeroed = false; + bool all_mm_zeroed = false; /* first, let's see whether we can zero all vector registers together. */ rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs); @@ -3665,24 +3734,42 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) all_sse_zeroed = true; } - /* then, let's see whether we can zero all st+mm registers togeter. */ - st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs); + /* Then, decide which mode (MMX mode or x87 mode) the function exit with. + In order to decide whether we need to clear the MMX registers or the + stack registers. */ + + bool exit_with_mmx_mode = (crtl->return_rtx + && (GET_CODE (crtl->return_rtx) == REG) + && (MMX_REG_P (crtl->return_rtx))); + + /* then, let's see whether we can zero all st registers together. */ + if (!exit_with_mmx_mode) + all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs); + /* Or should we zero all MMX registers. */ + else + { + unsigned int exit_mmx_regno = REGNO (crtl->return_rtx); + all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, + exit_mmx_regno); + } /* Now, generate instructions to zero all the registers. */ CLEAR_HARD_REG_SET (zeroed_hardregs); - if (st_zeroed) + if (all_st_zeroed) SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG); rtx zero_gpr = NULL_RTX; rtx zero_vector = NULL_RTX; rtx zero_mask = NULL_RTX; + rtx zero_mmx = NULL_RTX; for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) { if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) continue; - if (!zero_call_used_regno_p (regno, all_sse_zeroed)) + if (!zero_call_used_regno_p (regno, all_sse_zeroed, + exit_with_mmx_mode && !all_mm_zeroed)) continue; SET_HARD_REG_BIT (zeroed_hardregs, regno); @@ -3728,6 +3815,15 @@ ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs) } else emit_move_insn (reg, zero_mask); + else if (mode == DImode) + if (zero_mmx == NULL_RTX) + { + zero_mmx = reg; + tmp = gen_rtx_SET (reg, const0_rtx); + emit_insn (tmp); + } + else + emit_move_insn (reg, zero_mmx); else gcc_unreachable (); } diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c new file mode 100644 index 00000000000..61c0bb7a35c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */ + +typedef int __v2si __attribute__ ((vector_size (8))); + +__v2si ret_mmx (void) +{ + return (__v2si) { 123, 345 }; +} + +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */ +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */ diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c new file mode 100644 index 00000000000..db636654e70 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */ +typedef int __v2si __attribute__ ((vector_size (8))); + +long double ret_x87 (void) +{ + return 1.1L; +} + +/* { dg-final { scan-assembler-times "fldz" 7 } } */ +/* { dg-final { scan-assembler-times "fstp" 7 } } */ diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c new file mode 100644 index 00000000000..7c20b569bfa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target *-*-linux* } } */ +/* { dg-options "-O2 -fzero-call-used-regs=all" } */ +typedef int __v2si __attribute__ ((vector_size (8))); + +_Complex long double ret_x87_cplx (void) +{ + return 1.1L + 1.2iL; +} + +/* { dg-final { scan-assembler-times "fldz" 6 } } */ +/* { dg-final { scan-assembler-times "fstp" 6 } } */ -- 2.11.0 > On Oct 26, 2020, at 4:23 PM, Qing Zhao via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > >> On Oct 26, 2020, at 3:33 PM, Uros Bizjak <ubiz...@gmail.com> wrote: >> >> On Mon, Oct 26, 2020 at 9:05 PM Uros Bizjak <ubiz...@gmail.com> wrote: >>> >>> On Mon, Oct 26, 2020 at 8:10 PM Qing Zhao <qing.z...@oracle.com> wrote: >>>> >>>> >>>> >>>>> On Oct 26, 2020, at 1:42 PM, Uros Bizjak <ubiz...@gmail.com> wrote: >>>>> >>>>> On Mon, Oct 26, 2020 at 6:30 PM Qing Zhao <qing.z...@oracle.com> wrote: >>>>>> >>>>>> >>>>>> The following is the current change in i386.c, could you check whether >>>>>> the logic is good? >>>>> >>>>> x87 handling looks good to me. >>>>> >>>>> One remaining question: If the function uses MMX regs (either >>>>> internally or as an argument register), but exits in x87 mode, does >>>>> your logic clear the x87 stack? >>>> >>>> Yes but not completely yes. >>>> >>>> FIRST, As following: >>>> >>>> /* Then, decide which mode (MMX mode or x87 mode) the function exit with. >>>> In order to decide whether we need to clear the MMX registers or the >>>> stack registers. */ >>>> bool exit_with_mmx_mode = false; >>>> >>>> exit_with_mmx_mode = ((GET_CODE (crtl->return_rtx) == REG) >>>> && (MMX_REG_P (crtl->return_rtx))); >>>> >>>> /* then, let's see whether we can zero all st registers togeter. */ >>>> if (!exit_with_mmx_mode) >>>> st_zeroed = zero_all_st_registers (need_zeroed_hardregs); >>>> >>>> >>>> We first check whether this routine exit with mmx mode, if Not then it’s >>>> X87 mode >>>> (at exit, “EMMS” should already been called per ABI), then >>>> The st/mm registers will be cleared as x87 stack registers. >>>> >>>> However, within the routine “zero_all_st_registers”: >>>> >>>> static bool >>>> zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs) >>>> { >>>> unsigned int num_of_st = 0; >>>> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) >>>> if (STACK_REGNO_P (regno) >>>> && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)) >>>> { >>>> num_of_st++; >>>> break; >>>> } >>>> >>>> if (num_of_st == 0) >>>> return false; >>>> >>>> >>>> In the above, I currently only check whether any “Stack” registers need to >>>> be zeroed or not. >>>> But looks like we should also check any “MMX” register need to be zeroed >>>> or not too. If there is any >>>> “MMX” register need to be zeroed, we still need to clear the whole X87 >>>> stack? >>> >>> I think so, but I have to check the details. >> >> Please compile the following testcase with "-m32 -mmmx": >> >> --cut here-- >> #include <stdio.h> >> >> typedef int __v2si __attribute__ ((vector_size (8))); >> >> __v2si zzz; >> >> void >> __attribute__ ((noinline)) >> mmx (__v2si a, __v2si b, __v2si c) >> { >> __v2si res; >> >> res = __builtin_ia32_paddd (a, b); >> zzz = __builtin_ia32_paddd (res, c); >> >> __builtin_ia32_emms (); >> } >> >> >> int main () >> { >> __v2si a = { 123, 345 }; >> __v2si b = { 234, 456 }; >> __v2si c = { 345, 567 }; >> >> mmx (a, b, c); >> >> printf ("%i, %i\n", zzz[0], zzz[1]); >> >> return 0; >> } >> --cut here-- >> >> at the end of mmx() function: >> >> 0x080491ed in mmx () >> (gdb) disass >> Dump of assembler code for function mmx: >> 0x080491e0 <+0>: paddd %mm1,%mm0 >> 0x080491e3 <+3>: paddd %mm2,%mm0 >> 0x080491e6 <+6>: movq %mm0,0x804c020 >> => 0x080491ed <+13>: emms >> 0x080491ef <+15>: ret >> End of assembler dump. >> (gdb) i r flo >> st0 <invalid float value> (raw 0xffff00000558000002be) >> st1 <invalid float value> (raw 0xffff000001c8000000ea) >> st2 <invalid float value> (raw 0xffff0000023700000159) >> st3 0 (raw 0x00000000000000000000) >> st4 0 (raw 0x00000000000000000000) >> st5 0 (raw 0x00000000000000000000) >> st6 0 (raw 0x00000000000000000000) >> st7 0 (raw 0x00000000000000000000) >> fctrl 0x37f 895 >> fstat 0x0 0 >> ftag 0x556a 21866 >> fiseg 0x0 0 >> fioff 0x0 0 >> foseg 0x0 0 >> fooff 0x0 0 >> fop 0x0 0 >> >> There are still values in the MMX registers. However, we are in x87 >> mode, so the whole stack has to be cleared. > > Yes. And I just tried, my current implementation behaved correctly. >> >> Now, what to do if the function uses x87 registers and exits in MMX >> mode? I guess we have to clear all MMX registers (modulo return value >> reg). > > Need to add this part. > > thanks. > Qing >> >> Uros.