This patch adds support for matching a saturating truncate of a signed input variable in which negative values are clipped to 0 instead of NT_MAX. This pattern is seen in x264. We change the prior loop codegen from
vsetvli a5,a2,e32,m1,ta,mu vle32.v v1,0(a1) slli a4,a5,2 sub a2,a2,a5 add a1,a1,a4 vmsgtu.vx v0,v1,a3 vrsub.vi v2,v1,0 vsra.vi v1,v2,31,v0.t vsetvli zero,zero,e16,mf2,ta,ma vnsrl.wi v1,v1,0 vsetvli zero,zero,e8,mf4,ta,ma vnsrl.wi v1,v1,0 vse8.v v1,0(a0) to vsetvli a5,a2,e32,m1,ta,ma vle32.v v1,0(a1) slli a4,a5,2 sub a2,a2,a5 add a1,a1,a4 vmax.vv v1,v1,v2 <-- v2 defined by `vmv.v.i v2,0` outside vsetvli zero,zero,e16,mf2,ta,ma vnclipu.wi v1,v1,0 vsetvli zero,zero,e8,mf4,ta,ma vnclipu.wi v1,v1,0 vse8.v v1,0(a0) which is closer to how clang vectorizes the pattern. PR target/120378 gcc/ChangeLog: * match.pd: Add narrow clip pattern. * tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip): Add function declaration. (vect_recog_sat_trunc_pattern): Perform narrow clip check before regular sat trunc check. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr120378.c: New test. Signed-off-by: Edwin Lu <e...@rivosinc.com> --- gcc/match.pd | 23 +++++++++++++ .../gcc.target/riscv/rvv/autovec/pr120378.c | 21 ++++++++++++ gcc/tree-vect-patterns.cc | 32 +++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c diff --git a/gcc/match.pd b/gcc/match.pd index 82e6e291ae1..546e8fb3685 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -3360,6 +3360,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) } (if (wi::eq_p (sum, wi::uhwi (0, precision)))))))) +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)) + (match (unsigned_integer_narrow_clip @0) + /* SAT_U_TRUNC = (UT)X & (NT)(-1) ? (-X) >> 31 : X + + The gimple representation uses X > (NT)(-1) instead of + using & so match on gt instead of bit_and. */ + (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1) + (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2) + @0)) + (if (! 
TYPE_UNSIGNED (TREE_TYPE (@0))) + (with + { + unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0)); + unsigned otype_precision = TYPE_PRECISION (type); + wide_int trunc_max = wi::mask (otype_precision, false, itype_precision); + wide_int int_cst_1 = wi::to_wide (@1, itype_precision); + wide_int int_cst_2 = wi::to_wide (@2, itype_precision); + wide_int shift_amount = wi::uhwi ((HOST_WIDE_INT_1U << 5) - 1, + itype_precision); // Aka 31 + } + (if (otype_precision < itype_precision && wi::eq_p (trunc_max, + int_cst_1) && wi::eq_p(int_cst_2, shift_amount))))))) + /* Saturation truncate for unsigned integer. */ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)) (match (unsigned_integer_sat_trunc @0) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c new file mode 100644 index 00000000000..500028e7a15 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr120378.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include <stdint.h> + +inline uint8_t +clip_uint8 (int x) +{ + return x & (~255) ? 
(-x) >> 31 : x; +} + +void __attribute__ ((noipa)) +clip_loop (uint8_t *res, int *x, int w) +{ + for (int i = 0; i < w; i++) + res[i] = clip_uint8 (x[i]); +} + +/* { dg-final { scan-tree-dump-times ".SAT_TRUNC " 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "MAX_EXPR " 1 "optimized" } } */ +/* { dg-final { scan-assembler-times {vnclipu\.wi} 2 } } */ diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index ffb320fbf23..a6588950be4 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -4505,6 +4505,8 @@ extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); extern bool gimple_unsigned_integer_sat_trunc (tree, tree*, tree (*)(tree)); +extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree (*)(tree)); + extern bool gimple_signed_integer_sat_add (tree, tree*, tree (*)(tree)); extern bool gimple_signed_integer_sat_sub (tree, tree*, tree (*)(tree)); extern bool gimple_signed_integer_sat_trunc (tree, tree*, tree (*)(tree)); @@ -4739,6 +4741,36 @@ vect_recog_sat_trunc_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, tree lhs = gimple_assign_lhs (last_stmt); tree otype = TREE_TYPE (lhs); + if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL)) + && type_has_mode_precision_p (otype)) + { + tree itype = TREE_TYPE (ops[0]); + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); + internal_fn fn = IFN_SAT_TRUNC; + + if (v_itype != NULL_TREE && v_otype != NULL_TREE + && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype), + OPTIMIZE_FOR_BOTH)) + { + tree temp = vect_recog_temp_ssa_var (itype, NULL); + gimple * max_stmt = gimple_build_assign (temp, build2 (MAX_EXPR, itype, build_zero_cst(itype), ops[0])); + append_pattern_def_seq (vinfo, stmt_vinfo, max_stmt, v_itype); + + gcall *call = gimple_build_call_internal (fn, 1, temp); + tree out_ssa 
= vect_recog_temp_ssa_var (otype, NULL); + + gimple_call_set_lhs (call, out_ssa); + gimple_call_set_nothrow (call, /* nothrow_p */ false); + gimple_set_location (call, gimple_location (last_stmt)); + + *type_out = v_otype; + + return call; + } + + } + if ((gimple_unsigned_integer_sat_trunc (lhs, ops, NULL) || gimple_signed_integer_sat_trunc (lhs, ops, NULL)) && type_has_mode_precision_p (otype)) -- 2.43.0