On Fri, Aug 8, 2025 at 3:23 AM Richard Biener <richard.guent...@gmail.com> wrote: > > On Wed, Aug 6, 2025 at 7:04 AM Edwin Lu <e...@rivosinc.com> wrote: > > > > This patch tries to add support for a variant of SAT_TRUNC where > > negative numbers are clipped to 0 instead of NARROW_TYPE_MAX_VALUE. > > This form is seen in x264, aka > > > > UT clip (T a) > > { > > return a & (UT)(-1) ? (-a) >> 31 : a; > > the patch below says > > > + /* SAT_U_TRUNC = (UT)X & (NT)(-1) ? (-X) >> 31 : X > > so I'm not sure what this is about? Is an actual example > > unsigned char clip (unsigned int a) > { > return a & (unsigned char)-1 ? -a >> 31 : a; > } > Sorry if I made it unclear in the patch. I got confused about the notation when updating the match.pd file and once I got what I wanted working, I didn't update the comment. I was doing a lot of guessing/experimenting when updating the file so my knowledge is probably flawed.
The example I was trying to match is this: inline uint8_t x264_clip_uint8 (int x) { return x & (~255) ? (-x) >> 31 : x; } I believe the correct notation would be SAT_U_TRUNC = X & ~(NT)(-1) ? (-X) >> TYPE_PRECISION (X) - 1 : X to account for x being a non 32 bit value. > ? But then I'm confused about the hard-coded 31, -X >> 31 should > be the inverted sign-bit of X? > The hard-coded 31 comes from the example. I'll update the patch to have it reflect the input size properly, but yes, -X >> 31 should just populate the value with the inverted sign-bit of X > The pattern restricts this to signed X it seems and the result is also > signed. So what is this supposed to be? A form of MAX(0, x) for > signed x? > The result should be unsigned so am I missing something in the pattern to signify unsigned? My goal is if the pattern is matched, it should emit a = SMAX(0, x) b = SAT_TRUNC(a) > > } > > > > Where sizeof(UT) < sizeof(T) > > > > I'm unable to get the SAT_TRUNC pattern to appear on x86_64, however it > > does appear when building for riscv as seen below: > > > > Before this patch: > > <bb 3> [local count: 764504183]: > > # i_21 = PHI <i_14(8), 0(15)> > > # vectp_x.10_54 = PHI <vectp_x.10_55(8), x_10(D)(15)> > > # vectp_res.20_66 = PHI <vectp_res.20_67(8), res_11(D)(15)> > > # ivtmp_70 = PHI <ivtmp_71(8), _69(15)> > > _72 = .SELECT_VL (ivtmp_70, POLY_INT_CST [4, 4]); > > _1 = (long unsigned int) i_21; > > _2 = _1 * 4; > > _3 = x_10(D) + _2; > > ivtmp_53 = _72 * 4; > > vect__4.12_57 = .MASK_LEN_LOAD (vectp_x.10_54, 32B, { -1, ... }, _56(D), > > _72, 0); > > vect_x.13_58 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned > > int>(vect__4.12_57); > > vect__38.15_60 = -vect_x.13_58; > > vect__15.16_61 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect__38.15_60); > > vect__16.17_62 = vect__15.16_61 >> 31; > > mask__29.14_59 = vect_x.13_58 > { 255, ... }; > > vect__17.18_63 = VEC_COND_EXPR <mask__29.14_59, vect__16.17_62, > > vect__4.12_57>; > > vect__18.19_64 = (vector([4,4]) unsigned char) vect__17.18_63; > > _4 = *_3; > > _5 = res_11(D) + _1; > > x.0_12 = (unsigned int) _4; > > _38 = -x.0_12; > > _15 = (int) _38; > > _16 = _15 >> 31; > > _29 = x.0_12 > 255; > > _17 = _29 ? _16 : _4; > > _18 = (unsigned char) _17; > > .MASK_LEN_STORE (vectp_res.20_66, 8B, { -1, ... }, _72, 0, > > vect__18.19_64); > > i_14 = i_21 + 1; > > vectp_x.10_55 = vectp_x.10_54 + ivtmp_53; > > vectp_res.20_67 = vectp_res.20_66 + _72; > > ivtmp_71 = ivtmp_70 - _72; > > if (ivtmp_71 != 0) > > goto <bb 8>; [89.00%] > > else > > goto <bb 17>; [11.00%] > > > > After this patch: > > > > <bb 3> [local count: 764504183]: > > # i_21 = PHI <i_14(8), 0(15)> > > # vectp_x.10_68 = PHI <vectp_x.10_69(8), x_10(D)(15)> > > # vectp_res.15_75 = PHI <vectp_res.15_76(8), res_11(D)(15)> > > # ivtmp_79 = PHI <ivtmp_80(8), _78(15)> > > _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [4, 4]); > > _1 = (long unsigned int) i_21; > > _2 = _1 * 4; > > _3 = x_10(D) + _2; > > ivtmp_67 = _81 * 4; > > vect__4.12_71 = .MASK_LEN_LOAD (vectp_x.10_68, 32B, { -1, ... }, _70(D), > > _81, 0); > > vect_patt_37.13_72 = MAX_EXPR <{ 0, ... }, vect__4.12_71>; > > vect_patt_39.14_73 = .SAT_TRUNC (vect_patt_37.13_72); > > _4 = *_3; > > _5 = res_11(D) + _1; > > x.0_12 = (unsigned int) _4; > > _38 = -x.0_12; > > _15 = (int) _38; > > _16 = _15 >> 31; > > _29 = x.0_12 > 255; > > _17 = _29 ? _16 : _4; > > _18 = (unsigned char) _17; > > .MASK_LEN_STORE (vectp_res.15_75, 8B, { -1, ... }, _81, 0, > > vect_patt_39.14_73); > > i_14 = i_21 + 1; > > vectp_x.10_69 = vectp_x.10_68 + ivtmp_67; > > vectp_res.15_76 = vectp_res.15_75 + _81; > > ivtmp_80 = ivtmp_79 - _81; > > if (ivtmp_80 != 0) > > goto <bb 8>; [89.00%] > > else > > goto <bb 17>; [11.00%] > > > > gcc/ChangeLog: > > > > * match.pd: New NARROW_CLIP variant for SAT_TRUNC. > > * tree-vect-patterns.cc (gimple_unsigned_integer_narrow_clip): > > Add new decl for NARROW_CLIP. > > (vect_recog_sat_trunc_pattern): Add NARROW_CLIP check. > > > > Signed-off-by: Edwin Lu <e...@rivosinc.com> > > --- > > gcc/match.pd | 23 +++++++++++++++++++++++ > > gcc/tree-vect-patterns.cc | 32 ++++++++++++++++++++++++++++++++ > > 2 files changed, 55 insertions(+) > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 82e6e291ae1..546e8fb3685 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -3360,6 +3360,29 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > } > > (if (wi::eq_p (sum, wi::uhwi (0, precision)))))))) > > > > +(if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)) > > + (match (unsigned_integer_narrow_clip @0) > > + /* SAT_U_TRUNC = (UT)X & (NT)(-1) ? (-X) >> 31 : X > > + > > + The gimple representation uses X > (NT)(-1) instead of > > + using & so match on gt instead of bit_and. */ > > + (convert (cond^ (gt (nop_convert? @0) INTEGER_CST@1) > > + (rshift:s (nop_convert? (negate (nop_convert? @0))) INTEGER_CST@2) > > + @0)) > > + (if (! TYPE_UNSIGNED (TREE_TYPE (@0))) > > + (with > > + { > > + unsigned itype_precision = TYPE_PRECISION (TREE_TYPE (@0)); > > + unsigned otype_precision = TYPE_PRECISION (type); > > + wide_int trunc_max = wi::mask (otype_precision, false, > > itype_precision); > > + wide_int int_cst_1 = wi::to_wide (@1, itype_precision); > > + wide_int int_cst_2 = wi::to_wide (@2, itype_precision); > > + wide_int shift_amount = wi::uhwi ((HOST_WIDE_INT_1U << 5) - 1, > > + itype_precision); // Aka 31 > > + } > > + (if (otype_precision < itype_precision && wi::eq_p (trunc_max, > > + int_cst_1) && wi::eq_p(int_cst_2, shift_amount))))))) > > + > > /* Saturation truncate for unsigned integer. */ > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)) > > (match (unsigned_integer_sat_trunc @0) > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > index ffb320fbf23..a6588950be4 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -4505,6 +4505,8 @@ extern bool gimple_unsigned_integer_sat_add (tree, > > tree*, tree (*)(tree)); > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > extern bool gimple_unsigned_integer_sat_trunc (tree, tree*, tree > > (*)(tree)); > > > > +extern bool gimple_unsigned_integer_narrow_clip (tree, tree*, tree > > (*)(tree)); > > + > > extern bool gimple_signed_integer_sat_add (tree, tree*, tree (*)(tree)); > > extern bool gimple_signed_integer_sat_sub (tree, tree*, tree (*)(tree)); > > extern bool gimple_signed_integer_sat_trunc (tree, tree*, tree (*)(tree)); > > @@ -4739,6 +4741,36 @@ vect_recog_sat_trunc_pattern (vec_info *vinfo, > > stmt_vec_info stmt_vinfo, > > tree lhs = gimple_assign_lhs (last_stmt); > > tree otype = TREE_TYPE (lhs); > > > > + if ((gimple_unsigned_integer_narrow_clip (lhs, ops, NULL)) > > + && type_has_mode_precision_p (otype)) > > + { > > + tree itype = TREE_TYPE (ops[0]); > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > + internal_fn fn = IFN_SAT_TRUNC; > > + > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > + && direct_internal_fn_supported_p (fn, tree_pair (v_otype, v_itype), > > + OPTIMIZE_FOR_BOTH)) > > + { > > + tree temp = vect_recog_temp_ssa_var (itype, NULL); > > + gimple * max_stmt = gimple_build_assign (temp, build2 (MAX_EXPR, > > itype, build_zero_cst(itype), ops[0])); > > + append_pattern_def_seq (vinfo, stmt_vinfo, max_stmt, v_itype); > > + > > + gcall *call = gimple_build_call_internal (fn, 1, temp); > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > + > > + gimple_call_set_lhs (call, out_ssa); > > + gimple_call_set_nothrow (call, /* nothrow_p */ false); > > + gimple_set_location (call, gimple_location (last_stmt)); > > + > > + *type_out = v_otype; > > + > > + return call; > > + } > > + > > + } > > + > > if ((gimple_unsigned_integer_sat_trunc (lhs, ops, NULL) > > || gimple_signed_integer_sat_trunc (lhs, ops, NULL)) > > && type_has_mode_precision_p (otype)) > > -- > > 2.43.0 > >