https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81673

            Bug ID: 81673
           Summary: Harmful SLP vectorization
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jamborm at gcc dot gnu.org
  Target Milestone: ---

Revision r238364 has changed the cost of vec_construct in
ix86_builtin_vectorization_cost.  On one hand, the new code correctly
models the number of vector inserts, but on the other it has led to
SLP vectorization in the following testcase, which is extracted from
538.imagick_r, in which it leads to run-time regressions (depending on
the HW you use) of up to 6% at -O3 and -Ofast optimisation levels.

----------------------------------------------------------------------
typedef unsigned long size_t;
typedef long ssize_t;

typedef struct _RectangleInfo
{
  size_t
    width,
    height;

  ssize_t
    x,
    y;
} RectangleInfo;

void bar (RectangleInfo *region);


void
foo (void *ua, void *ub, const ssize_t x,const ssize_t y,
     const size_t columns,const size_t rows)
{
  RectangleInfo region;

  region.x=x;
  region.y=y;
  region.width=columns;
  region.height=rows;

  bar (&region);
}
----------------------------------------------------------------------

SLP2 converts this into:

  vect_cst__14 = {x_2(D), y_4(D)};
  vect_cst__12 = {columns_6(D), rows_8(D)};
  MEM[(long int *)&region + 16B] = vect_cst__14;
  MEM[(long unsigned int *)&region] = vect_cst__12;
  bar (&region);

which is then finally compiled to:

        .cfi_startproc
        subq    $72, %rsp
        .cfi_def_cfa_offset 80
        movq    %rdx, 24(%rsp)
        movq    %rcx, 8(%rsp)
        leaq    32(%rsp), %rdi
        movq    24(%rsp), %xmm0
        movq    %r9, 16(%rsp)
        movhps  8(%rsp), %xmm0
        movq    %r8, 8(%rsp)
        movaps  %xmm0, 48(%rsp)
        movq    8(%rsp), %xmm0
        movhps  16(%rsp), %xmm0
        movaps  %xmm0, 32(%rsp)
        call    bar
        addq    $72, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc

as opposed to the output of the previous revision:

        .cfi_startproc
        subq    $40, %rsp
        .cfi_def_cfa_offset 48
        movq    %rsp, %rdi
        movq    %rdx, 16(%rsp)
        movq    %rcx, 24(%rsp)
        movq    %r8, (%rsp)
        movq    %r9, 8(%rsp)
        call    bar
        addq    $40, %rsp
        .cfi_def_cfa_offset 8
        ret

The moves from GPRs into XMM registers through the stack are the thing
that can cost a lot of time (as shown by perf in the case of
538.imagick_r).

Reply via email to