Ping.

Thanks,
Wei.

On Wed, Apr 9, 2014 at 5:18 PM, Wei Mi <w...@google.com> wrote:
> Hi,
>
> For the testcase 1.c
>
> #include <emmintrin.h>
>
> double a[1000];
>
> __m128d foo1() {
>   __m128d res;
>   res = _mm_load_sd(&a[1]);
>   res = _mm_loadh_pd(res, &a[2]);
>   return res;
> }
>
> llvm will merge movsd/movhpd to movupd while gcc will not. The merge
> is beneficial on x86 machines starting from Nehalem.
>
> The patch is to add the merging in peephole.
> bootstrap and regression pass. Is it ok for stage1?
>
> Thanks,
> Wei.
>
> gcc/ChangeLog:
>
> 2014-04-09  Wei Mi  <w...@google.com>
>
>         * config/i386/i386.c (get_memref_parts): New function.
>         (adjacent_mem_locations): Ditto.
>         * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
>         * config/i386/sse.md: Add define_peephole2 rules.
>
> gcc/testsuite/ChangeLog:
>
> 2014-04-09  Wei Mi  <w...@google.com>
>
>         * gcc.target/i386/sse2-unaligned-mov.c: New test.
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 6e32978..3ae0d6d 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -312,6 +312,7 @@ extern enum attr_cpu ix86_schedule;
>  #endif
>
>  extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
> +extern bool adjacent_mem_locations (rtx mem1, rtx mem2);
>
>  #ifdef RTX_CODE
>  /* Target data for multipass lookahead scheduling.
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 3eefe4a..a330e84 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
>                     atomic_feraiseexcept_call);
>  }
>
> +/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
> +   Return true if successful, false if all the values couldn't
> +   be determined.
> +
> +   This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
> +   address forms.  */
> +
> +static bool
> +get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
> +                 HOST_WIDE_INT *size)
> +{
> +  rtx addr_rtx;
> +  if (MEM_SIZE_KNOWN_P (mem))
> +    *size = MEM_SIZE (mem);
> +  else
> +    return false;
> +
> +  if (GET_CODE (XEXP (mem, 0)) == CONST)
> +    addr_rtx = XEXP (XEXP (mem, 0), 0);
> +  else
> +    addr_rtx = (XEXP (mem, 0));
> +
> +  if (GET_CODE (addr_rtx) == REG
> +      || GET_CODE (addr_rtx) == SYMBOL_REF)
> +    {
> +      *base = addr_rtx;
> +      *offset = 0;
> +    }
> +  else if (GET_CODE (addr_rtx) == PLUS
> +          && CONST_INT_P (XEXP (addr_rtx, 1)))
> +    {
> +      *base = XEXP (addr_rtx, 0);
> +      *offset = INTVAL (XEXP (addr_rtx, 1));
> +    }
> +  else
> +    return false;
> +
> +  return true;
> +}
> +
> +/* If MEM1 is adjacent to MEM2 and MEM1 has lower address,
> +   return true.  */
> +
> +bool
> +adjacent_mem_locations (rtx mem1, rtx mem2)
> +{
> +  rtx base1, base2;
> +  HOST_WIDE_INT off1, size1, off2, size2;
> +
> +  if (get_memref_parts (mem1, &base1, &off1, &size1)
> +      && get_memref_parts (mem2, &base2, &off2, &size2))
> +    {
> +      if (GET_CODE (base1) == SYMBOL_REF
> +         && GET_CODE (base2) == SYMBOL_REF
> +         && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
> +        return (off1 + size1 == off2);
> +      else if (REG_P (base1)
> +              && REG_P (base2)
> +              && REGNO (base1) == REGNO (base2))
> +        return (off1 + size1 == off2);
> +    }
> +  return false;
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_RETURN_IN_MEMORY
>  #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 72a4d6d..4bf8461 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -15606,3 +15606,37 @@
>    [(set_attr "type" "sselog1")
>     (set_attr "length_immediate" "1")
>     (set_attr "mode" "TI")])
> +
> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> +;; is true.
> +(define_peephole2
> +  [(set (match_operand:DF 0 "register_operand")
> +       (match_operand:DF 1 "memory_operand"))
> +   (set (match_operand:V2DF 2 "register_operand")
> +       (vec_concat:V2DF (match_dup 0)
> +        (match_operand:DF 3 "memory_operand")))]
> +  "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
> +   && REGNO (operands[0]) == REGNO (operands[2])
> +   && adjacent_mem_locations (operands[1], operands[3])"
> +  [(set (match_dup 2)
> +       (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
> +{
> +  operands[4] = adjust_address (operands[1], V2DFmode, 0);
> +})
> +
> +;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> +;; is true.
> +(define_peephole2
> +  [(set (match_operand:DF 0 "memory_operand")
> +        (vec_select:DF (match_operand:V2DF 1 "register_operand")
> +                      (parallel [(const_int 0)])))
> +   (set (match_operand:DF 2 "memory_operand")
> +        (vec_select:DF (match_dup 1)
> +                       (parallel [(const_int 1)])))]
> +  "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
> +   && adjacent_mem_locations (operands[0], operands[2])"
> +  [(set (match_dup 3)
> +        (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
> +{
> +  operands[3] = adjust_address (operands[0], V2DFmode, 0);
> +})
> diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
> new file mode 100644
> index 0000000..28470ce
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mtune=corei7 -O2" } */
> +
> +#include <emmintrin.h>
> +
> +double a[1000];
> +
> +__m128d foo1() {
> +  __m128d res;
> +  res = _mm_load_sd(&a[1]);
> +  res = _mm_loadh_pd(res, &a[2]);
> +  return res;
> +}
> +
> +void foo2(__m128d res) {
> +  _mm_store_sd(&a[1], res);
> +  _mm_storeh_pd(&a[2], res);
> +}
> +
> +/* { dg-final { scan-assembler-times "movup" 2 } } */

Reply via email to