The following is the current change in i386.c. Could you check whether the
logic is correct?

thanks.

Qing 

/* Return true if hard register REGNO has to be zeroed individually on X86.
   ALL_SSE_ZEROED says that the SSE registers were already zeroed as a
   group, so an SSE register does not need to be zeroed again.
   EXIT_WITH_MMX_MODE says that MMX registers have to be cleared too.  */

static bool
zero_call_used_regno_p (const unsigned int regno,
                        bool all_sse_zeroed,
                        bool exit_with_mmx_mode)
{
  /* General-purpose registers are always candidates.  */
  if (GENERAL_REGNO_P (regno))
    return true;

  /* SSE registers only when they were not already zeroed together.  */
  if (!all_sse_zeroed && SSE_REGNO_P (regno))
    return true;

  /* Mask registers are always candidates.  */
  if (MASK_REGNO_P (regno))
    return true;

  /* MMX registers only when the caller asks for them.  */
  return exit_with_mmx_mode && MMX_REGNO_P (regno);
}

/* Return the machine_mode in which hard register REGNO is zeroed.
   Any register that is not a general, SSE, mask or MMX register is a
   caller error and hits gcc_unreachable.  */

static machine_mode
zero_call_used_regno_mode (const unsigned int regno)
{
  /* NB: Zeroing the low 32 bits of an integer register or the low 128
     bits of a vector register is enough, because the destination is
     zero-extended to the full register width.  */
  if (GENERAL_REGNO_P (regno))
    return SImode;

  if (SSE_REGNO_P (regno))
    return V4SFmode;

  if (MASK_REGNO_P (regno))
    return HImode;

  if (MMX_REGNO_P (regno))
    return V4HImode;

  gcc_unreachable ();
}

/* Return an rtx that zeroes all vector registers at once when that is
   possible, i.e. when AVX is enabled and every vector register that is
   considered below is a member of NEED_ZEROED_HARDREGS.  Otherwise
   return NULL.  */

static rtx
zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
{
  if (!TARGET_AVX)
    return NULL;

  unsigned int regno = 0;
  while (regno < FIRST_PSEUDO_REGISTER)
    {
      /* Is REGNO one of the vector registers that must all be requested?
         The REX and (with AVX512F) extended REX SSE registers are only
         considered for 64-bit targets.  */
      bool is_vector_regno = IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG);
      if (!is_vector_regno && TARGET_64BIT)
        is_vector_regno = (REX_SSE_REGNO_P (regno)
                           || (TARGET_AVX512F
                               && EXT_REX_SSE_REGNO_P (regno)));

      /* A single vector register left out of the request rules out the
         all-at-once zeroing.  */
      if (is_vector_regno
          && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
        return NULL;

      regno++;
    }

  return gen_avx_vzeroall ();
}


/* Generate insns to zero all st registers together.
   Return true when zeroing instructions are generated, false when no
   stack register is a member of NEED_ZEROED_HARDREGS.
   Assume the number of st registers that are zeroed is num_of_st,
   we will emit the following sequence to zero them together:
                  fldz;         \
                  fldz;         \
                  ...
                  fldz;         \
                  fstp %%st(0); \
                  fstp %%st(0); \
                  ...
                  fstp %%st(0);
   i.e., num_of_st fldz followed by num_of_st fstp, to clear the stack
   and mark the stack slots empty.

   How to compute num_of_st?
   There is no direct mapping from stack registers to hard register
   numbers.  If one stack register needs to be cleared, we don't know
   where in the stack the value remains.  So, if any stack register
   needs to be cleared, the whole stack should be cleared.  However,
   x87 stack registers that hold the return value should be excluded.
   x87 returns in the top (two for complex values) register, so
   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */

static bool
zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
{
  /* Is at least one stack register requested?  */
  bool any_st_requested = false;
  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
    if (STACK_REGNO_P (regno)
        && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
      {
        any_st_requested = true;
        break;
      }

  if (!any_st_requested)
    return false;

  /* crtl->return_rtx may be null (presumably when the function returns
     no value), so it must be checked before it is inspected.  */
  rtx return_rtx = crtl->return_rtx;
  bool return_with_x87 = (return_rtx != NULL_RTX
                          && STACK_REG_P (return_rtx));

  /* Clear the whole stack, except for the slot(s) holding an x87 return
     value: one slot for a scalar, two for a complex value.  */
  unsigned int num_of_st = 8;
  if (return_with_x87)
    {
      if (COMPLEX_MODE_P (GET_MODE (return_rtx)))
        num_of_st = 6;
      else
        num_of_st = 7;
    }

  rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);

  /* The loads of zero (fldz in the sequence above).  */
  for (unsigned int i = 0; i < num_of_st; i++)
    emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));

  /* The matching pops (fstp %st(0) in the sequence above): a self-move
     with a REG_DEAD note on the stack top.  */
  for (unsigned int i = 0; i < num_of_st; i++)
    {
      rtx insn = emit_insn (gen_rtx_SET (st_reg, st_reg));
      add_reg_note (insn, REG_DEAD, st_reg);
    }

  return true;
}

/* TARGET_ZERO_CALL_USED_REGS.  */
/* Generate a sequence of instructions that zero registers specified by
   NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are recorded
   as zeroed: every register zeroed individually below, plus
   FIRST_STACK_REG when the x87 stack was cleared.  */
static HARD_REG_SET
ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
{
  HARD_REG_SET zeroed_hardregs;
  bool all_sse_zeroed = false;
  bool st_zeroed = false;

  CLEAR_HARD_REG_SET (zeroed_hardregs);

  /* First, let's see whether we can zero all vector registers together.  */
  rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
  if (zero_all_vec_insn)
    {
      emit_insn (zero_all_vec_insn);
      all_sse_zeroed = true;
    }

  /* Then, decide which mode (MMX mode or x87 mode) the function exits
     with, in order to decide whether we need to clear the MMX registers
     or the stack registers.  crtl->return_rtx may be null (presumably
     when the function returns no value), so check it before use.  */
  bool exit_with_mmx_mode = (crtl->return_rtx != NULL_RTX
                             && MMX_REG_P (crtl->return_rtx));

  /* In x87 exit mode, let's see whether we can zero all st registers
     together.  */
  if (!exit_with_mmx_mode)
    st_zeroed = zero_all_st_registers (need_zeroed_hardregs);

  if (st_zeroed)
    SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);

  /* Now, generate instructions to zero the remaining registers one by
     one.  For each register class, the first register is set to zero
     and every later register of that class is copied from it.  */
  rtx zero_gpr = NULL_RTX;
  rtx zero_vector = NULL_RTX;
  rtx zero_mask = NULL_RTX;
  rtx zero_mmx = NULL_RTX;

  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
    {
      if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
        continue;
      if (!zero_call_used_regno_p (regno, all_sse_zeroed, exit_with_mmx_mode))
        continue;

      SET_HARD_REG_BIT (zeroed_hardregs, regno);

      machine_mode mode = zero_call_used_regno_mode (regno);
      rtx reg = gen_rtx_REG (mode, regno);

      /* Select the already-zeroed register of the same class, if any.
         V4HImode is the mode zero_call_used_regno_mode returns for MMX
         registers, which are selected in MMX exit mode.  */
      rtx *first_zeroed;
      if (mode == SImode)
        first_zeroed = &zero_gpr;
      else if (mode == V4SFmode)
        first_zeroed = &zero_vector;
      else if (mode == HImode)
        first_zeroed = &zero_mask;
      else if (mode == V4HImode)
        first_zeroed = &zero_mmx;
      else
        gcc_unreachable ();

      if (*first_zeroed != NULL_RTX)
        {
          emit_move_insn (reg, *first_zeroed);
          continue;
        }

      *first_zeroed = reg;

      /* Use the zero constant of MODE: const0_rtx is the integer zero
         and is not the right operand for a vector mode.  */
      rtx tmp = gen_rtx_SET (reg, CONST0_RTX (mode));

      /* For general registers, add a clobber of the flags register
         unless a plain move of zero is preferred (presumably because the
         zeroing is otherwise emitted as an xor, which modifies the
         flags).  */
      if (mode == SImode
          && (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()))
        {
          rtx clob = gen_rtx_CLOBBER (VOIDmode,
                                      gen_rtx_REG (CCmode, FLAGS_REG));
          tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
        }

      emit_insn (tmp);
    }

  return zeroed_hardregs;
}

> On Oct 26, 2020, at 11:13 AM, Uros Bizjak <ubiz...@gmail.com> wrote:
> 
> On Mon, Oct 26, 2020 at 3:45 PM Qing Zhao <qing.z...@oracle.com> wrote:
>> 
>> 
>> +/* Generate insns to zero all st/mm registers together.
>> +   Return true when zeroing instructions are generated.
>> +   Assume the number of st registers that are zeroed is num_of_st,
>> +   we will emit the following sequence to zero them together:
>> +                 fldz;         \
>> +                 fldz;         \
>> +                 ...
>> +                 fldz;         \
>> +                 fstp %%st(0); \
>> +                 fstp %%st(0); \
>> +                 ...
>> +                 fstp %%st(0);
>> +   i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
>> +   mark stack slots empty.  */
>> +
>> +static bool
>> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
>> +{
>> +  unsigned int num_of_st = 0;
>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>> +    if (STACK_REGNO_P (regno)
>> +       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
>> +       /* When the corresponding mm register also need to be cleared too.  
>> */
>> +       && TEST_HARD_REG_BIT (need_zeroed_hardregs,
>> +                             (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
>> +      num_of_st++;
>> 
>> 
>> I don't think the above logic is correct. It should go like this:
>> 
>> - If the function is returning an MMX register,
>> 
>> 
>> How to check on this? Is the following correct?
>> 
>> If (GET_CODE(crtl->return_rtx) == REG
>>    && (MMX_REG_P (REGNO (crtl->return_rtx)))
> 
> Yes, but please use
> 
> if (MMX_REG_P (crtl->return_rtx))
> 
>> 
>>   The function is returning an MMX register.
>> 
>> 
>> then the function
>> exits in MMX mode, and MMX registers should be cleared in the same way
>> as XMM registers.
>> 
>> 
>> When clearing XMM registers, we used V4SFmode; what mode should we use to
>> clear MMX registers?
> 
> It doesn't matter that much, any 8byte vector mode will do (including
> DImode). Let's use V4HImode.
> 
>> Otherwise the ABI specifies that the function exits
>> in x87 mode and x87 stack should be cleared (but see below).
>> 
>> - There is no direct mapping of stack registers to hard register
>> numbers. If a stack register is used, we don't know where in the stack
>> the value remains. So, if _any_ stack register is touched, the whole
>> stack should be cleared (value, returning in x87 stack register should
>> obviously be excluded).
>> 
>> 
>> Then, how do we exclude the x87 stack register that holds the function's
>> return value when we need to clear the whole stack?
>> I am a little confused here. Could you explain in a little more detail?
> 
> x87 returns in the top (two for complex values) register, so simply
> load 7 zeros (and 7 corresponding pops). This will preserve the return
> value but clear the whole remaining stack.
> 
>> - There is no x87 argument register. 32bit targets use MMX0-3 argument
>> registers and return value in the XMM register. Please also note that
>> complex values take two stack slots in x87 stack.
>> 
>> 
>> You mean the complex return value will be returned in two  x87 registers?
> 
> Yes, please see ix86_class_max_nregs. Please note that in case of
> complex return value, only 6 zeros should be loaded to avoid
> clobbering the complex return value.
> 
> Uros.

Reply via email to