On Tue, Jan 18, 2022 at 7:26 AM liuhongt <hongtao....@intel.com> wrote:
>
> For testcase in PR, the patch supports QI:4 -> HI:16 pack with
> multi steps(first pack QI:4 -> QI:8 through vec_pack_sbool_trunc_qi,
> then pack QI:8 -> HI:16 through vec_pack_trunc_hi).
> Similar for QI:2 -> HI:16 which is test4 in mask-pack-prefer-128.c.
>
> Bootstrapped both with and w/o '--with-arch=native --with-cpu=native'
> on CLX.
> Regtested for x86_64-pc-linux-gnu{-m32,} and
> x86_64-pc-linux-gnu{-m32\ -march=cascadelake,\ -march=cascadelake} on CLX.
>
> This patch can partially fix the regression in PR since it enable
> vectorization for epilogue.
> I'm also working on another patch to prevent mask pack for testcase in PR,
> and there will be a separate patch.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/103771
>         * tree-vect-stmts.c (supportable_narrowing_operation): Enhance
>         integral mode mask pack by multi steps which takes
>         vec_pack_sbool_trunc_optab as start when elements number is
>         less than BITS_PER_UNITS.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/mask-pack-prefer128.c: New test.
>         * gcc.target/i386/mask-pack-prefer128.c: New test.
>         * gcc.target/i386/pr103771.c: New test.
> ---
>  .../gcc.target/i386/mask-pack-prefer128.c      |  8 ++++++++
>  .../gcc.target/i386/mask-pack-prefer256.c      |  8 ++++++++
>  gcc/testsuite/gcc.target/i386/pr103771.c       | 18 ++++++++++++++++++
>  gcc/tree-vect-stmts.c                          | 11 ++++++++---
>  4 files changed, 42 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103771.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c 
> b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
> new file mode 100644
> index 00000000000..c9ea37c7ed3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer128.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd 
> -fdump-tree-vect-details -mprefer-vector-width=128" } */
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
> +/* { dg-final { scan-assembler-not "maskmov" } } */
> +
> +#include "mask-pack.c"
> diff --git a/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c 
> b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
> new file mode 100644
> index 00000000000..841f51b4041
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/mask-pack-prefer256.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=skylake-avx512 -O3 -fopenmp-simd 
> -fdump-tree-vect-details -mprefer-vector-width=256" } */
> +/* Disabling epilogues until we find a better way to deal with scans.  */
> +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 10 "vect" } } */
> +/* { dg-final { scan-assembler-not "maskmov" } } */
> +
> +#include "mask-pack.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr103771.c 
> b/gcc/testsuite/gcc.target/i386/pr103771.c
> new file mode 100644
> index 00000000000..a1a9952b6a8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103771.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=cascadelake -O3 -fdump-tree-vect-details 
> -mprefer-vector-width=128" } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +
> +typedef unsigned char uint8_t;
> +
> +static uint8_t x264_clip_uint8 (int x)
> +{
> +  return x & (~255) ? (-x) >> 31 : x;
> +}
> +
> +void
> +mc_weight (uint8_t* __restrict dst, uint8_t* __restrict src,
> +          int i_width,int i_scale)
> +{
> +  for(int x = 0; x < i_width; x++)
> +    dst[x] = x264_clip_uint8 (src[x] * i_scale);
> +}
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index f2625a2ff40..4baf5e36127 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12111,6 +12111,7 @@ supportable_narrowing_operation (enum tree_code code,
>    tree intermediate_type, prev_type;
>    machine_mode intermediate_mode, prev_mode;
>    int i;
> +  unsigned HOST_WIDE_INT n_elts;
>    bool uns;
>
>    *multi_step_cvt = 0;
> @@ -12120,8 +12121,10 @@ supportable_narrowing_operation (enum tree_code code,
>        c1 = VEC_PACK_TRUNC_EXPR;
>        if (VECTOR_BOOLEAN_TYPE_P (narrow_vectype)
>           && VECTOR_BOOLEAN_TYPE_P (vectype)
> -         && TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> -         && SCALAR_INT_MODE_P (TYPE_MODE (vectype)))
> +         && SCALAR_INT_MODE_P (TYPE_MODE (vectype))
> +         && (TYPE_MODE (narrow_vectype) == TYPE_MODE (vectype)
> +             || (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&n_elts)
> +                 && n_elts < BITS_PER_UNIT)))

your description above hints at that the actual modes involved in the
vec_pack_sbool_trunc are the same so the TYPE_MODE (narrow_vectype)
and TYPE_MODE (vectype) are not the actual modes participating.  I think
it would be way better to fix that.

I suppose that since we know TYPE_VECTOR_SUBPARTS is a power of two
it's always going to be only QImode that is of interest here so maybe a better
check would be TYPE_MODE (narrow_vectype) == QImode rather than
the equality check or elide the mode check completely and only retain
the TYPE_VECTOR_SUBPARTS check you add?

>         optab1 = vec_pack_sbool_trunc_optab;
>        else
>         optab1 = optab_for_tree_code (c1, vectype, optab_default);
> @@ -12213,7 +12216,9 @@ supportable_narrowing_operation (enum tree_code code,
>        if (VECTOR_BOOLEAN_TYPE_P (intermediate_type)
>           && VECTOR_BOOLEAN_TYPE_P (prev_type)
>           && intermediate_mode == prev_mode

Likewise here.

So I think the change is OK if you remove the mode equality checks.

Thanks,
Richard.

> -         && SCALAR_INT_MODE_P (prev_mode))
> +         && SCALAR_INT_MODE_P (prev_mode)
> +         && (TYPE_VECTOR_SUBPARTS (intermediate_type).is_constant (&n_elts)
> +             && n_elts < BITS_PER_UNIT))
>         interm_optab = vec_pack_sbool_trunc_optab;
>        else
>         interm_optab
> --
> 2.18.1
>

Reply via email to