Jakub Jelinek <[email protected]> wrote on 16/04/2011 09:16:23 AM:
>
> Hi!
>
> As the attached testcase shows, while the current detection of what
> shifts are by scalar and what shifts are by vector shift count
> may work well for loop vectorizer (vect_internal_def being vector
> shift, vect_external_def or vect_constant_def scalar shift),
> it is incorrect for SLP, where vect_external_def and vect_constant_def
> may be either scalar or vector shift (vect_external_def is e.g.
> if def_stmt is in a different bb and either it can be the same
> SSA_NAMEs in all shifts, or different, for vect_constant_def it
> can be either the same constant in all cases, or different)
> and in theory vect_internal_def could be also scalar shift (tried
> to test that in fn3's first bb, though currently SLP doesn't attempt
> to vectorize that, missed-optimization).
>
> The following patch fixes that. Bootstrapped/regtested on x86_64-linux
> and i686-linux and additionally tested on the testcase with additional
> -mxop. Ok for trunk/4.6?
>
> For better test coverage, perhaps the testcase should be duplicated
> after effective target xop and xop_runtime are added and additionally
> perhaps the testcase in gcc.dg/vect with -mxop testing what is SLP
> vectorized from the dumps would be nice.
>
> 2011-04-16 Jakub Jelinek <[email protected]>
>
> PR tree-optimization/48616
> * tree-vect-stmts.c (vectorizable_shift): If SLP, determine
> whether the shift is by scalar or vector based on whether all SLP
> scalar stmts have the same rhs.
>
> * gcc.dg/pr48616.c: New test.
>
> --- gcc/tree-vect-stmts.c.jj 2011-03-14 14:12:15.000000000 +0100
> +++ gcc/tree-vect-stmts.c 2011-04-15 20:44:30.000000000 +0200
> @@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_
> VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
> tree vop0, vop1;
> unsigned int k;
> - bool scalar_shift_arg = false;
> + bool scalar_shift_arg = true;
> bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
> int vf;
>
> @@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_
> /* Determine whether the shift amount is a vector, or scalar. If the
> shift/rotate amount is a vector, use the vector/vector shift optabs. */
>
> + if (dt[1] == vect_internal_def && !slp_node)
> + scalar_shift_arg = false;
> + else if (dt[1] == vect_constant_def
> + || dt[1] == vect_external_def
> + || dt[1] == vect_internal_def)
> + {
> + /* In SLP, need to check whether the shift count is the same,
> + in loops if it is a constant or invariant, it is always
> + a scalar shift. */
> + if (slp_node)
> + {
> + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> + gimple slpstmt;
> +
> + FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
> + if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
> + scalar_shift_arg = false;
We already have this check in vect_build_slp_tree(). It didn't work for the
testcase because it doesn't take into account the type of definition. I
agree that it's better to move it here and base the vector/scalar decision
on it, but please remove the now redundant check from vect_build_slp_tree().
Otherwise, OK for trunk.
Thanks,
Ira
> + }
> + }
> + else
> + {
> + if (vect_print_dump_info (REPORT_DETAILS))
> + fprintf (vect_dump, "operand mode requires invariant argument.");
> + return false;
> + }
> +
> /* Vector shifted by vector. */
> - if (dt[1] == vect_internal_def)
> + if (!scalar_shift_arg)
> {
> optab = optab_for_tree_code (code, vectype, optab_vector);
> if (vect_print_dump_info (REPORT_DETAILS))
> @@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_
> }
> /* See if the machine has a vector shifted by scalar insn and if not
> then see if it has a vector shifted by vector insn. */
> - else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
> + else
> {
> optab = optab_for_tree_code (code, vectype, optab_scalar);
> if (optab
> && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
> {
> - scalar_shift_arg = true;
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "vector/scalar shift/rotate found.");
> }
> @@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_
> && (optab_handler (optab, TYPE_MODE (vectype))
> != CODE_FOR_nothing))
> {
> + scalar_shift_arg = false;
> +
> if (vect_print_dump_info (REPORT_DETAILS))
> fprintf (vect_dump, "vector/vector shift/rotate found.");
>
> @@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_
> }
> }
> }
> - else
> - {
> - if (vect_print_dump_info (REPORT_DETAILS))
> - fprintf (vect_dump, "operand mode requires invariant argument.");
> - return false;
> - }
>
> /* Supportable by target? */
> if (!optab)
> --- gcc/testsuite/gcc.dg/pr48616.c.jj 2011-04-15 21:02:27.000000000 +0200
> +++ gcc/testsuite/gcc.dg/pr48616.c 2011-04-15 21:00:24.000000000 +0200
> @@ -0,0 +1,134 @@
> +/* PR tree-optimization/48616 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +extern void abort (void);
> +int a[4] __attribute__((aligned (32)));
> +int b[4] __attribute__((aligned (32)));
> +int c[4] __attribute__((aligned (32)));
> +int d[4] __attribute__((aligned (32)));
> +int e[4] __attribute__((aligned (32)));
> +
> +__attribute__((noinline, noclone))
> +int
> +foo (int x)
> +{
> + asm ("" : "+r" (x));
> + return x;
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn1 (int i)
> +{
> + a[0] = b[0] << c[0];
> + a[1] = b[1] << c[1];
> + a[2] = b[2] << c[2];
> + a[3] = b[3] << c[3];
> + if (i)
> + {
> + d[0] = e[0] >> c[0];
> + d[1] = e[1] >> c[1];
> + d[2] = e[2] >> c[2];
> + d[3] = e[3] >> c[3];
> + }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn2 (int i)
> +{
> + a[0] = b[0] << 1;
> + a[1] = b[1] << 2;
> + a[2] = b[2] << 3;
> + a[3] = b[3] << 4;
> + if (i)
> + {
> + d[0] = e[0] >> 1;
> + d[1] = e[1] >> 2;
> + d[2] = e[2] >> 3;
> + d[3] = e[3] >> 4;
> + }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn3 (int i, int j)
> +{
> + int x = foo (j);
> + a[0] = b[0] << x;
> + a[1] = b[1] << x;
> + a[2] = b[2] << x;
> + a[3] = b[3] << x;
> + if (i)
> + {
> + d[0] = e[0] >> x;
> + d[1] = e[1] >> x;
> + d[2] = e[2] >> x;
> + d[3] = e[3] >> x;
> + }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn4 (int i)
> +{
> + a[0] = b[0] << 1;
> + a[1] = b[1] << 1;
> + a[2] = b[2] << 1;
> + a[3] = b[3] << 1;
> + if (i)
> + {
> + d[0] = e[0] >> 1;
> + d[1] = e[1] >> 1;
> + d[2] = e[2] >> 1;
> + d[3] = e[3] >> 1;
> + }
> +}
> +
> +int
> +main ()
> +{
> + int i;
> + int *t;
> + for (i = 0; i < 4; i++)
> + {
> + b[i] = 32;
> + c[i] = i + 1;
> + e[i] = 32;
> + }
> + asm volatile ("" : : "r" (b) : "memory");
> + asm volatile ("" : : "r" (c) : "memory");
> + asm volatile ("" : "=r" (t) : "0" (d) : "memory");
> + fn1 (t != 0);
> + for (i = 0; i < 4; i++)
> + {
> + if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> + abort ();
> + a[i] = 0;
> + d[i] = 0;
> + }
> + fn2 (t != 0);
> + for (i = 0; i < 4; i++)
> + {
> + if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> + abort ();
> + a[i] = 0;
> + d[i] = 0;
> + }
> + fn3 (t != 0, t != 0);
> + for (i = 0; i < 4; i++)
> + {
> + if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> + abort ();
> + a[i] = 0;
> + d[i] = 0;
> + }
> + fn4 (t != 0);
> + for (i = 0; i < 4; i++)
> + {
> + if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> + abort ();
> + }
> + return 0;
> +}
>
> Jakub