PR87507 shows a problem where IRA assigns a non-volatile TImode reg pair to
a pseudo when there is a volatile reg pair available to use.  This then
causes us to emit save/restore code for the non-volatile reg usage.

My first attempt at solving this was to adjust the costs used by non-volatile
registers, but Vlad was hesitant to accept the patch since this is a sensitive
area that can cause performance issues:

  https://gcc.gnu.org/ml/gcc-patches/2018-10/msg00460.html

Since we're late in stage1, I decided to try another solution that doesn't
involve RA at all.  The only reason the register pairs exist at the moment
is that lower-subreg could not decompose them when compiling on ppc in LE
mode.  The reason is that for POWER8, some of the vector loads and stores
do not properly byte swap the values being loaded/stores, so our mov
patterns add an explicit rotate to the rtl insns.  So the following:

  (insn (set (mem:TI (reg/v/f:DI 122))
             (reg/v:TI 123)))

is replaced with:

  (insn (set (reg:TI 129)
             (rotate:TI (reg/v:TI 123) (const_int 64))))

  (insn (set (mem:TI (reg/v/f:DI 122))
             (rotate:TI (reg:TI 129) (const_int 64))))

On BE, we are able to correctly decompose the TImode access into two DImode
accesses, but on LE, lower-subreg doesn't see the accesses as simple moves
anymore, and so fails to decompose them.  However, is we look at what
lower-subreg tries, it sees a:

  (insn (set (concatn/v:TI [(reg:DI 131 [src])
                            (reg:DI 132 [src+8])])
             (rotate:TI (concatn:TI [(reg:DI 133)
                                     (reg:DI 134 [+8])])
                        (const_int 64))))

This is just a word sized rotate of a double word sized register pair
and that is just equivalent to stripping the rotate and swapping the
two registers like so:

  (insn (set (concatn/v:TI [(reg:DI 131 [src])
                            (reg:DI 132 [src+8])])
             (concatn:TI [(reg:DI 134 [+8])
                          (reg:DI 133)])))

The following patch extends lower-subreg to recognize these word sized
rotates as simple moves so that it can replace the rotates with swapped
decomposed registers.

This has passed bootstrap and regtesting on powerpc64le-linux with no
regressions.  Is this ok for mainline?

Peter

gcc/
        PR rtl-optimization/87507
        * lower-subreg.c (simple_move_operator): New function.
        (simple_move): Strip simple operators.
        (find_pseudo_copy): Likewise.
        (resolve_simple_move): Strip simple operators and swap operands.

gcc/testsuite/
        PR rtl-optimization/87507
        * gcc.target/powerpc/pr87507.c: New test.
        * gcc.target/powerpc/pr68805.c: Update expected results.

Index: gcc/lower-subreg.c
===================================================================
--- gcc/lower-subreg.c  (revision 265971)
+++ gcc/lower-subreg.c  (working copy)
@@ -320,6 +320,24 @@ simple_move_operand (rtx x)
   return true;
 }
 
+/* If X is an operator that can be treated as a simple move that we
+   can split, then return the operand that is operated on.  */
+
+static rtx
+simple_move_operator (rtx x)
+{
+  /* A word sized rotate of a register pair is equivalent to swapping
+     the registers in the register pair.  */
+  if (GET_CODE (x) == ROTATE
+      && GET_MODE (x) == twice_word_mode
+      && simple_move_operand (XEXP (x, 0))
+      && CONST_INT_P (XEXP (x, 1))
+      && INTVAL (XEXP (x, 1)) == BITS_PER_WORD)
+    return XEXP (x, 0);;
+
+  return NULL_RTX;
+}
+
 /* If INSN is a single set between two objects that we want to split,
    return the single set.  SPEED_P says whether we are optimizing
    INSN for speed or size.
@@ -330,7 +348,7 @@ simple_move_operand (rtx x)
 static rtx
 simple_move (rtx_insn *insn, bool speed_p)
 {
-  rtx x;
+  rtx x, op;
   rtx set;
   machine_mode mode;
 
@@ -348,6 +366,9 @@ simple_move (rtx_insn *insn, bool speed_
     return NULL_RTX;
 
   x = SET_SRC (set);
+  if ((op = simple_move_operator (x)) != NULL_RTX)
+    x = op;
+
   if (x != recog_data.operand[0] && x != recog_data.operand[1])
     return NULL_RTX;
   /* For the src we can handle ASM_OPERANDS, and it is beneficial for
@@ -386,9 +407,13 @@ find_pseudo_copy (rtx set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
+  rtx op;
   unsigned int rd, rs;
   bitmap b;
 
+  if ((op = simple_move_operator (src)) != NULL_RTX)
+    src = op;
+
   if (!REG_P (dest) || !REG_P (src))
     return false;
 
@@ -853,7 +878,7 @@ can_decompose_p (rtx x)
 static rtx_insn *
 resolve_simple_move (rtx set, rtx_insn *insn)
 {
-  rtx src, dest, real_dest;
+  rtx src, dest, real_dest, src_op;
   rtx_insn *insns;
   machine_mode orig_mode, dest_mode;
   unsigned int orig_size, words;
@@ -876,6 +901,33 @@ resolve_simple_move (rtx set, rtx_insn *
 
   real_dest = NULL_RTX;
 
+  if ((src_op = simple_move_operator (src)) != NULL_RTX)
+    {
+      if (resolve_reg_p (dest))
+       {
+         /* DEST is a CONCATN, so swap its operands and strip
+            SRC's operator.  */
+         rtx concatn = copy_rtx (dest);
+         rtx op0 = XVECEXP (concatn, 0, 0);
+         rtx op1 = XVECEXP (concatn, 0, 1);
+         XVECEXP (concatn, 0, 0) = op1;
+         XVECEXP (concatn, 0, 1) = op0;
+         dest = concatn;
+         src = src_op;
+       }
+      else if (resolve_reg_p (src_op))
+       {
+         /* SRC is an operation on a CONCATN, so strip the operator and
+            swap the CONCATN's operands.  */
+         rtx concatn = copy_rtx (src_op);
+         rtx op0 = XVECEXP (concatn, 0, 0);
+         rtx op1 = XVECEXP (concatn, 0, 1);
+         XVECEXP (concatn, 0, 0) = op1;
+         XVECEXP (concatn, 0, 1) = op0;
+         src = concatn;
+       }
+    }
+
   if (GET_CODE (src) == SUBREG
       && resolve_reg_p (SUBREG_REG (src))
       && (maybe_ne (SUBREG_BYTE (src), 0)
Index: gcc/testsuite/gcc.target/powerpc/pr87507.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr87507.c  (nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr87507.c  (working copy)
@@ -0,0 +1,22 @@
+/* { dg-do compile { target powerpc64le-*-* } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { 
"-mcpu=power8" } } */
+/* { dg-options "-O2 -mcpu=power8" } */
+
+typedef struct
+{
+  __int128_t x;
+  __int128_t y;
+} foo_t;
+
+void
+foo (long cond, foo_t *dst, __int128_t src)
+{
+  if (cond)
+  {
+    dst->x = src;
+    dst->y = src;
+  }
+}
+
+/* { dg-final { scan-assembler-times {\mstd\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mld\M} } } */
Index: gcc/testsuite/gcc.target/powerpc/pr68805.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr68805.c  (revision 265971)
+++ gcc/testsuite/gcc.target/powerpc/pr68805.c  (working copy)
@@ -9,7 +9,7 @@ typedef struct bar {
 
 void foo (TYPE *p, TYPE *q) { *p = *q; }
 
-/* { dg-final { scan-assembler     "lxvd2x"   } } */
-/* { dg-final { scan-assembler     "stxvd2x"  } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
 /* { dg-final { scan-assembler-not "xxpermdi" } } */
 

Reply via email to