On Fri, 10 Jun 2016, Richard Biener wrote:

> 
> With the proposed cost change for vector construction we will end up
> vectorizing the testcase in PR68961 again (on x86_64 and likely
> on ppc64le as well after that target gets adjustments).  Currently
> we can't optimize that away again noticing the direct overlap of
> argument and return registers.  The obstackle is
> 
> (insn 7 4 8 2 (set (reg:V2DF 93)
>         (vec_concat:V2DF (reg/v:DF 91 [ a ])
>             (reg/v:DF 92 [ aa ]))) 
> ...
> (insn 21 8 24 2 (set (reg:DI 97 [ D.1756 ])
>         (subreg:DI (reg:TI 88 [ D.1756 ]) 0))
> (insn 24 21 11 2 (set (reg:DI 100 [+8 ])
>         (subreg:DI (reg:TI 88 [ D.1756 ]) 8))
> 
> which we eventually optimize to DFmode subregs of (reg:V2DF 93).
> 
> First of all simplify_subreg doesn't handle the subregs of a vec_concat
> (easy fix below).
> 
> Then combine doesn't like to simplify the multi-use (it tries some
> parallel it seems).  So I went to forwprop which eventually manages
> to do this but throws away the result (reg:DF 91) or (reg:DF 92)
> because it is not a constant.  Thus I allow arbitrary simplification
> results for SUBREGs of [VEC_]CONCAT operations.  There doesn't seem
> to be a magic flag to tell it to restrict to the case where all
> uses can be simplified or so, nor to restrict simplifications to a REG.
> But I don't see any undesirable simplifications of (subreg 
> ([vec_]concat)).
> 
> For the testcase I'm not sure if I have to exclude some ABIs (mingw?).
> 
> Boostrap and regtest in progress on x86_64-unknown-linux-gnu, I'll
> install the simplify-rtx.c if that succeeds but like to have opinions
> on the fwprop.c change.

So the bootstrap exposes a latent issue in simplify-rtx.c in the changed
hunk via gcc.target/i386/mmx-8.c on i?86 which ends up with a 

(vec_concat:V2SI (reg:SI 103)
    (const_int 0 [0]))

and thus a VOIDmode 2nd operand (I'm sure this can happen for
complex integer concat as well, thus latent).  I am adjusting the
simplify_subreg hunk to always pass GET_MODE_INNER (innermode)
(that hopefully exercises it a bit more than just using that
if GET_MODE (part) == VOIDmode - and hopefully they should always
agree).

Re-bootstrap / regtest running on x86_64-unknown-linux-gnu.

Comments still welcome.

Thanks,
Richard.

2016-06-13  Richard Biener  <rguent...@suse.de>

        PR rtl-optimization/68961
        * simplify-rtx.c (simplify_subreg): Handle VEC_CONCAT like CONCAT.
        * fwprop.c (propagate_rtx): Allow SUBREGs of VEC_CONCAT and CONCAT
        to simplify to a non-constant.

        * gcc.target/i386/pr68961.c: New testcase.

Index: gcc/simplify-rtx.c
===================================================================
*** gcc/simplify-rtx.c  (revision 237286)
--- gcc/simplify-rtx.c  (working copy)
*************** simplify_subreg (machine_mode outermode,
*** 6108,6116 ****
        && GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (GET_MODE (op)))
      return adjust_address_nv (op, outermode, byte);
  
!   /* Handle complex values represented as CONCAT
!      of real and imaginary part.  */
!   if (GET_CODE (op) == CONCAT)
      {
        unsigned int part_size, final_offset;
        rtx part, res;
--- 6108,6117 ----
        && GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (GET_MODE (op)))
      return adjust_address_nv (op, outermode, byte);
  
!   /* Handle complex or vector values represented as CONCAT or VEC_CONCAT
!      of two parts.  */
!   if (GET_CODE (op) == CONCAT
!       || GET_CODE (op) == VEC_CONCAT)
      {
        unsigned int part_size, final_offset;
        rtx part, res;
*************** simplify_subreg (machine_mode outermode,
*** 6130,6139 ****
        if (final_offset + GET_MODE_SIZE (outermode) > part_size)
        return NULL_RTX;
  
!       res = simplify_subreg (outermode, part, GET_MODE (part), final_offset);
        if (res)
        return res;
!       if (validate_subreg (outermode, GET_MODE (part), part, final_offset))
        return gen_rtx_SUBREG (outermode, part, final_offset);
        return NULL_RTX;
      }
--- 6131,6141 ----
        if (final_offset + GET_MODE_SIZE (outermode) > part_size)
        return NULL_RTX;
  
!       enum machine_mode part_mode = GET_MODE_INNER (innermode);
!       res = simplify_subreg (outermode, part, part_mode, final_offset);
        if (res)
        return res;
!       if (validate_subreg (outermode, part_mode, part, final_offset))
        return gen_rtx_SUBREG (outermode, part, final_offset);
        return NULL_RTX;
      }
Index: gcc/fwprop.c
===================================================================
*** gcc/fwprop.c        (revision 237286)
--- gcc/fwprop.c        (working copy)
*************** propagate_rtx (rtx x, machine_mode mode,
*** 664,670 ****
        || (GET_CODE (new_rtx) == SUBREG
          && REG_P (SUBREG_REG (new_rtx))
          && (GET_MODE_SIZE (mode)
!             <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (new_rtx))))))
      flags |= PR_CAN_APPEAR;
    if (!varying_mem_p (new_rtx))
      flags |= PR_HANDLE_MEM;
--- 664,673 ----
        || (GET_CODE (new_rtx) == SUBREG
          && REG_P (SUBREG_REG (new_rtx))
          && (GET_MODE_SIZE (mode)
!             <= GET_MODE_SIZE (GET_MODE (SUBREG_REG (new_rtx)))))
!       || ((GET_CODE (new_rtx) == VEC_CONCAT
!          || GET_CODE (new_rtx) == CONCAT)
!         && GET_CODE (x) == SUBREG))
      flags |= PR_CAN_APPEAR;
    if (!varying_mem_p (new_rtx))
      flags |= PR_HANDLE_MEM;
Index: gcc/testsuite/gcc.target/i386/pr68961.c
===================================================================
*** gcc/testsuite/gcc.target/i386/pr68961.c     (revision 0)
--- gcc/testsuite/gcc.target/i386/pr68961.c     (working copy)
***************
*** 0 ****
--- 1,19 ----
+ /* { dg-do compile { target lp64 } } */
+ /* { dg-options "-O3 -fno-vect-cost-model -fdump-tree-slp2-details" } */
+ 
+ struct x { double d[2]; };
+ 
+ struct x
+ pack (double a, double aa)
+ {
+   struct x u;
+   u.d[0] = a;
+   u.d[1] = aa;
+   return u;
+ }
+ 
+ /* The function should be optimized to just return as arguments and
+    result exactly overlap even when previously vectorized.  */
+ 
+ /* { dg-final { scan-tree-dump "basic block vectorized" "slp2" } } */
+ /* { dg-final { scan-assembler-not "mov" } } */

Reply via email to