Bin, when will the patch for the generic pass be available for review?

David

On Wed, Apr 9, 2014 at 7:27 PM, Bin.Cheng <amker.ch...@gmail.com> wrote:
> On Thu, Apr 10, 2014 at 8:18 AM, Wei Mi <w...@google.com> wrote:
>> Hi,
>>
>> For the testcase 1.c
>>
>> #include <emmintrin.h>
>>
>> double a[1000];
>>
>> __m128d foo1() {
>>   __m128d res;
>>   res = _mm_load_sd(&a[1]);
>>   res = _mm_loadh_pd(res, &a[2]);
>>   return res;
>> }
>>
>> LLVM merges the movsd/movhpd pair into a single movupd, while GCC
>> does not.  The merge is beneficial on x86 machines from Nehalem
>> onwards, where unaligned SSE loads are cheap.
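>>
>> For reference, the unmerged and merged sequences look roughly like
>> this (illustrative assembly; the exact addressing and registers
>> depend on the compiler's choices):
>>
>>     # before the peephole
>>     movsd   a+8(%rip), %xmm0    # load a[1] into the low half
>>     movhpd  a+16(%rip), %xmm0   # load a[2] into the high half
>>
>>     # after the peephole
>>     movupd  a+8(%rip), %xmm0    # one unaligned 128-bit load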
>>
>> The patch adds the merging as a peephole2.  Bootstrap and
>> regression tests pass.  Is it OK for stage1?
>>
>> Thanks,
>> Wei.
>>
>> gcc/ChangeLog:
>>
>> 2014-04-09  Wei Mi  <w...@google.com>
>>
>>         * config/i386/i386.c (get_memref_parts): New function.
>>         (adjacent_mem_locations): Ditto.
>>         * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
>>         * config/i386/sse.md: Add define_peephole2 rules.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2014-04-09  Wei Mi  <w...@google.com>
>>
>>         * gcc.target/i386/sse2-unaligned-mov.c: New test.
>>
>> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
>> index 6e32978..3ae0d6d 100644
>> --- a/gcc/config/i386/i386-protos.h
>> +++ b/gcc/config/i386/i386-protos.h
>> @@ -312,6 +312,7 @@ extern enum attr_cpu ix86_schedule;
>>  #endif
>>
>>  extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
>> +extern bool adjacent_mem_locations (rtx mem1, rtx mem2);
>>
>>  #ifdef RTX_CODE
>>  /* Target data for multipass lookahead scheduling.
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index 3eefe4a..a330e84 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold,
>> tree *clear, tree *update)
>>                     atomic_feraiseexcept_call);
>>  }
>>
>> +/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
>> +   Return true if successful, false if not all of the values
>> +   could be determined.
>> +
>> +   This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
>> +   address forms. */
>> +
>> +static bool
>> +get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
>> +                 HOST_WIDE_INT *size)
>> +{
>> +  rtx addr_rtx;
>> +  if (MEM_SIZE_KNOWN_P (mem))
>> +    *size = MEM_SIZE (mem);
>> +  else
>> +    return false;
>> +
>> +  if (GET_CODE (XEXP (mem, 0)) == CONST)
>> +    addr_rtx = XEXP (XEXP (mem, 0), 0);
>> +  else
>> +    addr_rtx = XEXP (mem, 0);
>> +
>> +  if (REG_P (addr_rtx)
>> +      || GET_CODE (addr_rtx) == SYMBOL_REF)
>> +    {
>> +      *base = addr_rtx;
>> +      *offset = 0;
>> +    }
>> +  else if (GET_CODE (addr_rtx) == PLUS
>> +          && CONST_INT_P (XEXP (addr_rtx, 1)))
>> +    {
>> +      *base = XEXP (addr_rtx, 0);
>> +      *offset = INTVAL (XEXP (addr_rtx, 1));
>> +    }
>> +  else
>> +    return false;
>> +
>> +  return true;
>> +}
>> +
>> +/* Return true if MEM1 is adjacent to MEM2 and MEM1 has the
>> +   lower address.  */
>> +
>> +bool
>> +adjacent_mem_locations (rtx mem1, rtx mem2)
>> +{
>> +  rtx base1, base2;
>> +  HOST_WIDE_INT off1, size1, off2, size2;
>> +
>> +  if (get_memref_parts (mem1, &base1, &off1, &size1)
>> +      && get_memref_parts (mem2, &base2, &off2, &size2))
>> +    {
>> +      if (GET_CODE (base1) == SYMBOL_REF
>> +         && GET_CODE (base2) == SYMBOL_REF
>> +         && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
>> +        return (off1 + size1 == off2);
>> +      else if (REG_P (base1)
>> +              && REG_P (base2)
>> +              && REGNO (base1) == REGNO (base2))
>> +        return (off1 + size1 == off2);
>> +    }
>> +  return false;
>> +}
>> +
>>  /* Initialize the GCC target structure.  */
>>  #undef TARGET_RETURN_IN_MEMORY
>>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
>> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> index 72a4d6d..4bf8461 100644
>> --- a/gcc/config/i386/sse.md
>> +++ b/gcc/config/i386/sse.md
>> @@ -15606,3 +15606,37 @@
>>    [(set_attr "type" "sselog1")
>>     (set_attr "length_immediate" "1")
>>     (set_attr "mode" "TI")])
>> +
>> +;; Merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> +  [(set (match_operand:DF 0 "register_operand")
>> +       (match_operand:DF 1 "memory_operand"))
>> +   (set (match_operand:V2DF 2 "register_operand")
>> +       (vec_concat:V2DF (match_dup 0)
>> +        (match_operand:DF 3 "memory_operand")))]
>> +  "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
>> +   && REGNO (operands[0]) == REGNO (operands[2])
>> +   && adjacent_mem_locations (operands[1], operands[3])"
>> +  [(set (match_dup 2)
>> +       (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
>> +{
>> +  operands[4] = gen_rtx_MEM (V2DFmode, XEXP (operands[1], 0));
>> +})
>> +
>> +;; Merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> +;; is true.
>> +(define_peephole2
>> +  [(set (match_operand:DF 0 "memory_operand")
>> +        (vec_select:DF (match_operand:V2DF 1 "register_operand")
>> +                      (parallel [(const_int 0)])))
>> +   (set (match_operand:DF 2 "memory_operand")
>> +        (vec_select:DF (match_dup 1)
>> +                       (parallel [(const_int 1)])))]
>> +  "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
>> +   && adjacent_mem_locations (operands[0], operands[2])"
>> +  [(set (match_dup 3)
>> +        (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
>> +{
>> +  operands[3] = gen_rtx_MEM (V2DFmode, XEXP (operands[0], 0));
>> +})
>> diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> new file mode 100644
>> index 0000000..28470ce
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
>> @@ -0,0 +1,20 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-mtune=corei7 -O2" } */
>> +
>> +#include <emmintrin.h>
>> +
>> +double a[1000];
>> +
>> +__m128d foo1() {
>> +  __m128d res;
>> +  res = _mm_load_sd(&a[1]);
>> +  res = _mm_loadh_pd(res, &a[2]);
>> +  return res;
>> +}
>> +
>> +void foo2(__m128d res) {
>> +  _mm_store_sd(&a[1], res);
>> +  _mm_storeh_pd(&a[2], res);
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "movup" 2 } } */
>
> Hi Wei,
> Just FYI, though I am not sure whether it helps here.  I am working
> on load/store pairing on ARM too.  On ARM (and maybe other machines
> like MIPS) the problem is more general, because memory accesses can
> be paired through both core registers and FPU registers.  The
> current strategy in GCC is to use peephole2, but that is too weak:
> for example, it cannot handle pairs that are separated by other
> instructions.  Right now I am working on a new GCC pass to do that
> work and have a draft patch.  The instrumentation data looks
> promising, with many opportunities caught beyond what the existing
> peephole2 pass finds.  The result could be even better if register
> renaming were enabled, so I hope to bring register renaming back
> for this purpose later.
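>
> To give a flavour, here is a minimal sketch of the kind of scan I
> mean.  The helpers record_pair and invalidates_candidate_p are
> hypothetical placeholders, not code from my actual patch:
>
>   /* Sketch: scan a basic block for two loads from adjacent memory
>      locations, tolerating unrelated insns in between.  */
>   static void
>   find_load_pairs (basic_block bb)
>   {
>     rtx insn, candidate = NULL_RTX;
>
>     FOR_BB_INSNS (bb, insn)
>       {
>         rtx set;
>
>         if (!NONDEBUG_INSN_P (insn))
>           continue;
>
>         set = single_set (insn);
>         if (!set || !MEM_P (SET_SRC (set)))
>           {
>             /* Anything that may write memory or clobber the base
>                register kills the pending candidate (hypothetical
>                helper).  */
>             if (candidate && invalidates_candidate_p (insn, candidate))
>               candidate = NULL_RTX;
>             continue;
>           }
>
>         if (candidate
>             && adjacent_mem_locations (SET_SRC (single_set (candidate)),
>                                        SET_SRC (set)))
>           record_pair (candidate, insn);  /* hypothetical */
>
>         candidate = insn;
>       }
>   }
>
> The point of allowing intervening insns is exactly what peephole2
> cannot do, since it only matches consecutive instructions.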
>
> Thanks,
> bin
