On 11/07/18 17:48, Jackson Woodruff wrote:
> Hi Sudi,
> 
> Thanks for the review.
> 
> 
> On 07/10/2018 10:56 AM, Sudakshina wrote:
>> Hi Jackson
>>
>>
>> -  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
>> +  if (!MEM_P (mem[1]) || aarch64_mem_pair_operand (mem[1], mode))
>>
>> mem_1 == mem[1]?
> Oops, yes... That should be mem[0].
>>
>>      return false;
>>
>> -  /* The mems cannot be volatile.  */
>> ...
>>
>> /* If we have SImode and slow unaligned ldp,
>>       check the alignment to be at least 8 byte. */
>>    if (mode == SImode
>>        && (aarch64_tune_params.extra_tuning_flags
>> -          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
>> +      & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
>>        && !optimize_size
>> -      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
>> +      && MEM_ALIGN (mem[1]) < 8 * BITS_PER_UNIT)
>>
>> Likewise
> Done
>> ...
>>    /* Check if the registers are of same class.  */
>> -  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 !=
>> rclass_4)
>> -    return false;
>> +  for (int i = 0; i < 3; i++)
>>
>> num_instructions -1 instead of 3 would be more consistent.
> Done
>>
>> +    if (rclass[i] != rclass[i + 1])
>> +      return false;
>>
>> It looks good otherwise.
>>
>> Thanks
>> Sudi
> 
> Re-regtested and boostrapped.
> 
> OK for trunk?
> 
> Thanks,
> 
> Jackson
> 
> loops.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 01f35f8e8525adb455780269757452c8c3eb20be..da44b33b2bc12f9aa2122cf5194e244437fb31a5
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -17026,23 +17026,21 @@ bool
>  aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
>                                      scalar_mode mode)
>  {
> -  enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
> -  HOST_WIDE_INT offvals[4], msize;
> -  rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
> -  rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
> +  const int num_instructions = 4;

It's conventional in GCC to use "insn(s)" rather than "instruction(s)";
saves a lot of typing.

> +  enum reg_class rclass[num_instructions];
> +  HOST_WIDE_INT offvals[num_instructions], msize;
> +  rtx mem[num_instructions], reg[num_instructions],
> +      base[num_instructions], offset[num_instructions];
>  
>    if (load)
>      {
> -      reg_1 = operands[0];
> -      mem_1 = operands[1];
> -      reg_2 = operands[2];
> -      mem_2 = operands[3];
> -      reg_3 = operands[4];
> -      mem_3 = operands[5];
> -      reg_4 = operands[6];
> -      mem_4 = operands[7];
> -      gcc_assert (REG_P (reg_1) && REG_P (reg_2)
> -               && REG_P (reg_3) && REG_P (reg_4));
> +      for (int i = 0; i < num_instructions; i++)
> +     {
> +       reg[i] = operands[2 * i];
> +       mem[i] = operands[2 * i + 1];
> +
> +       gcc_assert (REG_P (reg[i]));
> +     }
>  
>        /* Do not attempt to merge the loads if the loads clobber each other.  
> */
>        for (int i = 0; i < 8; i += 2)
> @@ -17051,53 +17049,47 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx 
> *operands, bool load,
>           return false;
>      }
>    else
> -    {
> -      mem_1 = operands[0];
> -      reg_1 = operands[1];
> -      mem_2 = operands[2];
> -      reg_2 = operands[3];
> -      mem_3 = operands[4];
> -      reg_3 = operands[5];
> -      mem_4 = operands[6];
> -      reg_4 = operands[7];
> -    }
> +    for (int i = 0; i < num_instructions; i++)
> +      {
> +     mem[i] = operands[2 * i];
> +     reg[i] = operands[2 * i + 1];
> +      }
> +
>    /* Skip if memory operand is by itslef valid for ldp/stp.  */

Not technically a bug in your patch, but please fix the typo here when
you commit: itslef -> itself.

> -  if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
> +  if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
>      return false;
>  
> -  /* The mems cannot be volatile.  */
> -  if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
> -      || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
> -    return false;
> +  for (int i = 0; i < num_instructions; i++)
> +    {
> +      /* The mems cannot be volatile.  */
> +      if (MEM_VOLATILE_P (mem[i]))
> +     return false;
>  
> -  /* Check if the addresses are in the form of [base+offset].  */
> -  extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
> -  if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
> -    return false;
> -  extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
> -  if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
> -    return false;
> -  extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
> -  if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
> -    return false;
> -  extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
> -  if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
> -    return false;
> +      /* Check if the addresses are in the form of [base+offset].  */
> +      extract_base_offset_in_addr (mem[i], base + i, offset + i);
> +      if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
> +     return false;
> +    }
> +
> +  /* Check if addresses are clobbered by load.  */
> +  if (load)
> +    for (int i = 0; i < num_instructions; i++)
> +      if (reg_mentioned_p (reg[i], mem[i]))
> +     return false;
>  
>    /* Check if the bases are same.  */
> -  if (!rtx_equal_p (base_1, base_2)
> -      || !rtx_equal_p (base_2, base_3)
> -      || !rtx_equal_p (base_3, base_4))
> -    return false;
> +  for (int i = 0; i < num_instructions - 1; i++)
> +    if (!rtx_equal_p (base[i], base[i + 1]))
> +      return false;
> +
> +  for (int i = 0; i < num_instructions; i++)
> +    offvals[i] = INTVAL (offset[i]);
>  
> -  offvals[0] = INTVAL (offset_1);
> -  offvals[1] = INTVAL (offset_2);
> -  offvals[2] = INTVAL (offset_3);
> -  offvals[3] = INTVAL (offset_4);
>    msize = GET_MODE_SIZE (mode);
>  
>    /* Check if the offsets can be put in the right order to do a ldp/stp.  */
> -  qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
> +  qsort (offvals, num_instructions, sizeof (HOST_WIDE_INT),
> +      aarch64_host_wide_int_compare);
>  
>    if (!(offvals[1] == offvals[0] + msize
>       && offvals[3] == offvals[2] + msize))
> @@ -17112,45 +17104,25 @@ aarch64_operands_adjust_ok_for_ldpstp (rtx 
> *operands, bool load,
>    if (offvals[0] % msize != offvals[2] % msize)
>      return false;
>  
> -  /* Check if the addresses are clobbered by load.  */
> -  if (load && (reg_mentioned_p (reg_1, mem_1)
> -            || reg_mentioned_p (reg_2, mem_2)
> -            || reg_mentioned_p (reg_3, mem_3)
> -            || reg_mentioned_p (reg_4, mem_4)))
> -    return false;
> -
>    /* If we have SImode and slow unaligned ldp,
>       check the alignment to be at least 8 byte. */
>    if (mode == SImode
>        && (aarch64_tune_params.extra_tuning_flags
> -          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
> +       & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
>        && !optimize_size
> -      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
> +      && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
>      return false;
>  
> -  if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
> -    rclass_1 = FP_REGS;
> -  else
> -    rclass_1 = GENERAL_REGS;
> -
> -  if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
> -    rclass_2 = FP_REGS;
> -  else
> -    rclass_2 = GENERAL_REGS;
> -
> -  if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
> -    rclass_3 = FP_REGS;
> -  else
> -    rclass_3 = GENERAL_REGS;
> -
> -  if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
> -    rclass_4 = FP_REGS;
> -  else
> -    rclass_4 = GENERAL_REGS;
> +  for (int i = 0; i < num_instructions; i++)
> +    if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
> +      rclass[i] = FP_REGS;
> +    else
> +      rclass[i] = GENERAL_REGS;
>  
>    /* Check if the registers are of same class.  */
> -  if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
> -    return false;
> +  for (int i = 0; i < num_instructions - 1; i++)
> +    if (rclass[i] != rclass[i + 1])
> +      return false;
>  

Why not initialize rclass to the class of the first register and then
loop once over the remaining elements checking that it matches.  Then
you don't need an array for rclass and you have one fewer loops?

>    return true;
>  }
> 

OK with those changes.

Thanks.

R.

Reply via email to