Hi!

Yesterday I noticed that for AVX, which allows unaligned operands in AVX
arithmetic instructions, we still don't combine unaligned loads with the
AVX arithmetic instructions.  So, say, for -O2 -mavx -ftree-vectorize

void
f1 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i] * 7;
}

void
f2 (int *__restrict e, int *__restrict f)
{
  int i;
  for (i = 0; i < 1024; i++)
    e[i] = f[i];
}

we have:
	vmovdqu	(%rsi,%rax), %xmm0
	vpmulld	%xmm1, %xmm0, %xmm0
	vmovups	%xmm0, (%rdi,%rax)
in the first loop.
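With the patch, combine should be able to merge the unaligned load
directly into the arithmetic instruction, so (modulo exact register
allocation) I'd expect something like:
	vpmulld	(%rsi,%rax), %xmm1, %xmm0
	vmovups	%xmm0, (%rdi,%rax)
instead, since the VEX-encoded arithmetic instructions don't require
aligned memory operands.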
Apparently all the MODE_VECTOR_INT and MODE_VECTOR_FLOAT
*mov<mode>_internal patterns (and various others) use misaligned_operand
to see whether they should emit vmovaps or vmovups (etc.).  So, as
suggested by Richard on IRC, it isn't necessary to either allow
UNSPEC_LOADU in the memory operands of all the various non-move AVX
instructions for TARGET_AVX, or to add extra patterns to help combine;
instead, this patch just uses *mov<mode>_internal in that case (assuming
an initially misaligned_operand doesn't become !misaligned_operand
through RTL optimizations).

Additionally, the patch attempts to avoid gen_lowpart on the non-MEM
lhs of the unaligned loads, which usually makes combine fail, by doing
the load into a temporary pseudo in that case and then doing a
pseudo-to-pseudo move with gen_lowpart on the rhs (which will be merged
soon after into the following instructions).
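In sketch form (this just restates what the patch below does, shown
here for the V16QImode case), instead of

  op0 = gen_lowpart (V16QImode, op0);
  emit_insn (gen_sse2_loaddquv16qi (op0, op1));

the expander now loads into a fresh pseudo when op0 isn't already in
the right mode, and emits a lowpart move afterwards:

  orig_op0 = op0;
  op0 = gen_reg_rtx (V16QImode);
  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
  emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));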
I'll bootstrap/regtest this on x86_64-linux and i686-linux;
unfortunately, my bootstrap/regtest server isn't AVX capable.

2013-10-30  Jakub Jelinek  <ja...@redhat.com>

	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign): If
	op1 is misaligned_operand, just use *mov<mode>_internal insn
	rather than UNSPEC_LOADU load.
	(ix86_expand_vector_move_misalign): Likewise (for TARGET_AVX only).
	Avoid gen_lowpart on op0 if it isn't MEM.

--- gcc/config/i386/i386.c.jj	2013-10-30 08:15:38.000000000 +0100
+++ gcc/config/i386/i386.c	2013-10-30 10:20:22.684708729 +0100
@@ -16560,6 +16560,12 @@ ix86_avx256_split_vector_move_misalign (
 	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
 	  emit_move_insn (op0, r);
 	}
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      else if (misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       else
 	emit_insn (load_unaligned (op0, op1));
     }
@@ -16634,7 +16640,7 @@ ix86_avx256_split_vector_move_misalign (
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, m;
+  rtx op0, op1, orig_op0 = NULL_RTX, m;
   rtx (*load_unaligned) (rtx, rtx);
   rtx (*store_unaligned) (rtx, rtx);
 
@@ -16647,7 +16653,16 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V16SImode, op0);
+	  if (GET_MODE (op0) != V16SImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V16SImode);
+		}
+	      else
+		op0 = gen_lowpart (V16SImode, op0);
+	    }
 	  op1 = gen_lowpart (V16SImode, op1);
 	  /* FALLTHRU */
 
@@ -16676,6 +16691,8 @@ ix86_expand_vector_move_misalign (enum m
 	    emit_insn (store_unaligned (op0, op1));
 	  else
 	    gcc_unreachable ();
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16692,12 +16709,23 @@ ix86_expand_vector_move_misalign (enum m
 	{
 	case MODE_VECTOR_INT:
 	case MODE_INT:
-	  op0 = gen_lowpart (V32QImode, op0);
+	  if (GET_MODE (op0) != V32QImode)
+	    {
+	      if (!MEM_P (op0))
+		{
+		  orig_op0 = op0;
+		  op0 = gen_reg_rtx (V32QImode);
+		}
+	      else
+		op0 = gen_lowpart (V32QImode, op0);
+	    }
 	  op1 = gen_lowpart (V32QImode, op1);
 	  /* FALLTHRU */
 
 	case MODE_VECTOR_FLOAT:
 	  ix86_avx256_split_vector_move_misalign (op0, op1);
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	  break;
 
 	default:
@@ -16709,15 +16737,30 @@ ix86_expand_vector_move_misalign (enum m
 
   if (MEM_P (op1))
     {
+      /* Normal *mov<mode>_internal pattern will handle
+	 unaligned loads just fine if misaligned_operand
+	 is true, and without the UNSPEC it can be combined
+	 with arithmetic instructions.  */
+      if (TARGET_AVX
+	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+	  && misaligned_operand (op1, GET_MODE (op1)))
+	emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
       /* ??? If we have typed data, then it would appear that using
 	 movdqu is the only way to get unaligned data loaded with
 	 integer type.  */
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
 	{
-	  op0 = gen_lowpart (V16QImode, op0);
+	  if (GET_MODE (op0) != V16QImode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V16QImode);
+	    }
 	  op1 = gen_lowpart (V16QImode, op1);
 	  /* We will eventually emit movups based on insn attributes.  */
 	  emit_insn (gen_sse2_loaddquv16qi (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 	}
       else if (TARGET_SSE2 && mode == V2DFmode)
 	{
@@ -16765,9 +16808,16 @@ ix86_expand_vector_move_misalign (enum m
 	  || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
 	  || optimize_insn_for_size_p ())
 	{
-	  op0 = gen_lowpart (V4SFmode, op0);
+	  if (GET_MODE (op0) != V4SFmode)
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V4SFmode);
+	    }
 	  op1 = gen_lowpart (V4SFmode, op1);
 	  emit_insn (gen_sse_loadups (op0, op1));
+	  if (orig_op0)
+	    emit_move_insn (orig_op0,
+			    gen_lowpart (GET_MODE (orig_op0), op0));
 	  return;
 	}

	Jakub