On Fri, 17 Nov 2023, Jakub Jelinek wrote: > Hi! > > ctz(ext(X)) is the same as ctz(X) in the UB on zero case (or could be also > in the 2 argument case on large BITINT_TYPE by preserving the argument, not > implemented in this patch), > popcount(zext(X)) is the same as popcount(X), > parity(zext(X)) is the same as parity(X), > parity(sext(X)) is the same as parity(X) provided the bit difference between > the extended and unextended types is even, > ffs(ext(X)) is the same as ffs(X). > > The following patch optimizes those in match.pd if those are beneficial > (always in the large BITINT_TYPE case, or if the narrower type has optab > and the wider doesn't, or the wider is larger than word and narrower is > one of the standard argument sizes (tested just int and long long, as > long has on most targets the same bitsize as one of those two)). > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK. > Joseph in the PR mentioned that ctz(narrow(X)) is the same as ctz(X) > if UB on 0, but that can be handled incrementally (and would need different > decisions when it is profitable). > And clz(zext(X)) is clz(X) + bit_difference, but not sure we want to change > that in match.pd at all, perhaps during insn selection? Yeah, it would be less canonical for GIMPLE I think. Richard. > 2023-11-17 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/112566 > PR tree-optimization/83171 > * match.pd (ctz(ext(X)) -> ctz(X), popcount(zext(X)) -> popcount(X), > parity(ext(X)) -> parity(X), ffs(ext(X)) -> ffs(X)): New > simplifications. > ( __builtin_ffs (X) == 0 -> X == 0): Use FFS rather than > BUILT_IN_FFS BUILT_IN_FFSL BUILT_IN_FFSLL BUILT_IN_FFSIMAX. > > * gcc.dg/pr112566-1.c: New test. > * gcc.dg/pr112566-2.c: New test. > * gcc.target/i386/pr78057.c (foo): Pass another long long argument > and use it in __builtin_ia32_*zcnt_u64 instead of the int one. > > --- gcc/match.pd.jj 2023-11-14 10:52:16.190276041 +0100 > +++ gcc/match.pd 2023-11-16 13:09:41.221275374 +0100 > @@ -8672,6 +8672,50 @@ (define_operator_list SYNC_FETCH_AND_AND > wi::shifted_mask (tree_to_uhwi (@1), 1, > false, prec)); }))))))) > > +#if GIMPLE > +/* ctz(ext(X)) == ctz(X). Valid just for the UB at zero cases though. 
*/ > +(simplify > + (CTZ (convert@1 @0)) > + (if (INTEGRAL_TYPE_P (TREE_TYPE (@1)) > + && INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0))) > + (with { combined_fn cfn = CFN_LAST; > + tree type0 = TREE_TYPE (@0); > + if (TREE_CODE (type0) == BITINT_TYPE) > + { > + if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE) > + cfn = CFN_CTZ; > + else > + type0 > + = build_nonstandard_integer_type (TYPE_PRECISION (type0), > + 1); > + } > + type0 = unsigned_type_for (type0); > + if (cfn == CFN_LAST > + && direct_internal_fn_supported_p (IFN_CTZ, type0, > + OPTIMIZE_FOR_BOTH)) > + cfn = CFN_CTZ; > + if (cfn == CFN_LAST > + && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD > + && !direct_internal_fn_supported_p (IFN_CTZ, > + TREE_TYPE (@1), > + OPTIMIZE_FOR_BOTH)) > + { > + if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (unsigned_type_node)) > + cfn = CFN_BUILT_IN_CTZ; > + else if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (long_long_unsigned_type_node)) > + cfn = CFN_BUILT_IN_CTZLL; > + } } > + (if (cfn == CFN_CTZ) > + (IFN_CTZ (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_CTZ) > + (BUILT_IN_CTZ (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_CTZLL) > + (BUILT_IN_CTZLL (convert:type0 @0)))))))) > +#endif > + > /* POPCOUNT simplifications. */ > /* popcount(X) + popcount(Y) is popcount(X|Y) when X&Y must be zero. */ > (simplify > @@ -8742,6 +8786,50 @@ (define_operator_list SYNC_FETCH_AND_AND > (popcount:s @1)) > (popcount (log2 @0 @1))))) > > +#if GIMPLE > +/* popcount(zext(X)) == popcount(X). 
*/ > +(simplify > + (POPCOUNT (convert@1 @0)) > + (if (INTEGRAL_TYPE_P (TREE_TYPE (@1)) > + && INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) > + && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0))) > + (with { combined_fn cfn = CFN_LAST; > + tree type0 = TREE_TYPE (@0); > + if (TREE_CODE (type0) == BITINT_TYPE) > + { > + if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE) > + cfn = CFN_POPCOUNT; > + else > + type0 > + = build_nonstandard_integer_type (TYPE_PRECISION (type0), > + 1); > + } > + if (cfn == CFN_LAST > + && direct_internal_fn_supported_p (IFN_POPCOUNT, type0, > + OPTIMIZE_FOR_BOTH)) > + cfn = CFN_POPCOUNT; > + if (cfn == CFN_LAST > + && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD > + && !direct_internal_fn_supported_p (IFN_POPCOUNT, > + TREE_TYPE (@1), > + OPTIMIZE_FOR_BOTH)) > + { > + if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (unsigned_type_node)) > + cfn = CFN_BUILT_IN_POPCOUNT; > + else if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (long_long_unsigned_type_node)) > + cfn = CFN_BUILT_IN_POPCOUNTLL; > + } } > + (if (cfn == CFN_POPCOUNT) > + (IFN_POPCOUNT (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_POPCOUNT) > + (BUILT_IN_POPCOUNT (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_POPCOUNTLL) > + (BUILT_IN_POPCOUNTLL (convert:type0 @0)))))))) > +#endif > + > /* PARITY simplifications. */ > /* parity(~X) is parity(X). */ > (simplify > @@ -8780,6 +8868,54 @@ (define_operator_list SYNC_FETCH_AND_AND > (bit_xor (PARITY:s @0) (PARITY:s @1)) > (PARITY (bit_xor @0 @1))) > > +#if GIMPLE > +/* parity(zext(X)) == parity(X). */ > +/* parity(sext(X)) == parity(X) if the difference in precision is even. 
*/ > +(simplify > + (PARITY (convert@1 @0)) > + (if (INTEGRAL_TYPE_P (TREE_TYPE (@1)) > + && INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0)) > + && (TYPE_UNSIGNED (TREE_TYPE (@0)) > + || ((TYPE_PRECISION (TREE_TYPE (@1)) > + - TYPE_PRECISION (TREE_TYPE (@0))) & 1) == 0)) > + (with { combined_fn cfn = CFN_LAST; > + tree type0 = TREE_TYPE (@0); > + if (TREE_CODE (type0) == BITINT_TYPE) > + { > + if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE) > + cfn = CFN_PARITY; > + else > + type0 > + = build_nonstandard_integer_type (TYPE_PRECISION (type0), > + 1); > + } > + type0 = unsigned_type_for (type0); > + if (cfn == CFN_LAST > + && direct_internal_fn_supported_p (IFN_PARITY, type0, > + OPTIMIZE_FOR_BOTH)) > + cfn = CFN_PARITY; > + if (cfn == CFN_LAST > + && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD > + && !direct_internal_fn_supported_p (IFN_PARITY, > + TREE_TYPE (@1), > + OPTIMIZE_FOR_BOTH)) > + { > + if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (unsigned_type_node)) > + cfn = CFN_BUILT_IN_PARITY; > + else if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (long_long_unsigned_type_node)) > + cfn = CFN_BUILT_IN_PARITYLL; > + } } > + (if (cfn == CFN_PARITY) > + (IFN_PARITY (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_PARITY) > + (BUILT_IN_PARITY (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_PARITYLL) > + (BUILT_IN_PARITYLL (convert:type0 @0)))))))) > +#endif > + > /* a != 0 ? FUN(a) : 0 -> Fun(a) for some builtin functions. */ > (for func (POPCOUNT BSWAP FFS PARITY) > (simplify > @@ -8987,8 +9123,7 @@ (define_operator_list SYNC_FETCH_AND_AND > (plus (CTZ:type (convert:utype @0)) { build_one_cst (type); })))) > #endif > > -(for ffs (BUILT_IN_FFS BUILT_IN_FFSL BUILT_IN_FFSLL > - BUILT_IN_FFSIMAX) > +(for ffs (FFS) > /* __builtin_ffs (X) == 0 -> X == 0. > __builtin_ffs (X) == 6 -> (X & 63) == 32. 
*/ > (for cmp (eq ne) > @@ -9036,6 +9171,50 @@ (define_operator_list SYNC_FETCH_AND_AND > { build_zero_cst (TREE_TYPE (@0)); })))))))) > > #if GIMPLE > +/* ffs(ext(X)) == ffs(X). */ > +(simplify > + (FFS (convert@1 @0)) > + (if (INTEGRAL_TYPE_P (TREE_TYPE (@1)) > + && INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && TYPE_PRECISION (TREE_TYPE (@1)) > TYPE_PRECISION (TREE_TYPE (@0))) > + (with { combined_fn cfn = CFN_LAST; > + tree type0 = TREE_TYPE (@0); > + if (TREE_CODE (type0) == BITINT_TYPE) > + { > + if (TYPE_PRECISION (type0) > MAX_FIXED_MODE_SIZE) > + cfn = CFN_FFS; > + else > + type0 > + = build_nonstandard_integer_type (TYPE_PRECISION (type0), > + 0); > + } > + type0 = signed_type_for (type0); > + if (cfn == CFN_LAST > + && direct_internal_fn_supported_p (IFN_FFS, type0, > + OPTIMIZE_FOR_BOTH)) > + cfn = CFN_FFS; > + if (cfn == CFN_LAST > + && TYPE_PRECISION (TREE_TYPE (@1)) > BITS_PER_WORD > + && !direct_internal_fn_supported_p (IFN_FFS, > + TREE_TYPE (@1), > + OPTIMIZE_FOR_BOTH)) > + { > + if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (integer_type_node)) > + cfn = CFN_BUILT_IN_FFS; > + else if (TYPE_PRECISION (type0) > + == TYPE_PRECISION (long_long_integer_type_node)) > + cfn = CFN_BUILT_IN_FFSLL; > + } } > + (if (cfn == CFN_FFS) > + (IFN_FFS (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_FFS) > + (BUILT_IN_FFS (convert:type0 @0)) > + (if (cfn == CFN_BUILT_IN_FFSLL) > + (BUILT_IN_FFSLL (convert:type0 @0)))))))) > +#endif > + > +#if GIMPLE > > /* Simplify: > a = op a1 > --- gcc/testsuite/gcc.dg/pr112566-1.c.jj 2023-11-16 13:29:23.063761320 > +0100 > +++ gcc/testsuite/gcc.dg/pr112566-1.c 2023-11-16 13:35:14.459852714 +0100 > @@ -0,0 +1,14 @@ > +/* PR tree-optimization/112566 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > +/* { dg-additional-options "-mbmi2 -mlzcnt -mpopcnt" { target i?86-*-* > x86_64-*-* } } */ > +/* { dg-final { scan-tree-dump-not "ll \\\(" "optimized" { target ia32 } } } > */ > +/* { dg-final { 
scan-tree-dump-not "\\\(long long (unsigned )?int\\\)" > "optimized" { target ia32 } } } */ > + > +int foo (unsigned int x) { return __builtin_ctzll (x); } > +int bar (unsigned int x) { return __builtin_popcountll (x); } > +int baz (unsigned int x) { return __builtin_parityll (x); } > +int qux (int x) { return __builtin_ffsll (x); } > +int corge (int x) { return __builtin_ctzll (x); } > +int garply (int x) { return __builtin_parityll (x); } > +int fred (unsigned int x) { return __builtin_ffsll (x); } > --- gcc/testsuite/gcc.dg/pr112566-2.c.jj 2023-11-16 13:32:30.854138101 > +0100 > +++ gcc/testsuite/gcc.dg/pr112566-2.c 2023-11-16 13:34:14.727687104 +0100 > @@ -0,0 +1,12 @@ > +/* PR tree-optimization/112566 */ > +/* { dg-do compile { target bitint575 } } */ > +/* { dg-options "-O2 -fdump-tree-ccp2" } */ > +/* { dg-final { scan-tree-dump-not "\\\((unsigned )?_BitInt\\\(512\\\)\\\)" > "ccp2" } } */ > + > +int foo (unsigned _BitInt(256) x) { return __builtin_ctzg ((unsigned > _BitInt(512)) x); } > +int bar (unsigned _BitInt(256) x) { return __builtin_popcountg ((unsigned > _BitInt(512)) x); } > +int baz (unsigned _BitInt(256) x) { return __builtin_parityg ((unsigned > _BitInt(512)) x); } > +int qux (_BitInt(256) x) { return __builtin_ffsg ((_BitInt(512)) x); } > +int corge (_BitInt(256) x) { return __builtin_ctzg ((unsigned _BitInt(512)) > x); } > +int garply (_BitInt(256) x) { return __builtin_parityg ((unsigned > _BitInt(512)) x); } > +int fred (unsigned _BitInt(256) x) { return __builtin_ffsg ((_BitInt(512)) > x); } > --- gcc/testsuite/gcc.target/i386/pr78057.c.jj 2020-01-14 > 20:02:47.876593462 +0100 > +++ gcc/testsuite/gcc.target/i386/pr78057.c 2023-11-17 14:27:39.120666182 > +0100 > @@ -5,7 +5,7 @@ > extern void link_error (void); > > int > -foo (int x) > +foo (int x, long long y) > { > if (__builtin_ia32_tzcnt_u16 (16) != 4 > || __builtin_ia32_tzcnt_u16 (0) != 16 > @@ -24,13 +24,14 @@ foo (int x) > ) > link_error (); > x += 2; > - if (x == 0) > + y += 2; > + if 
(x == 0 || y == 0) > return 5; > return __builtin_ia32_tzcnt_u32 (x) > + __builtin_ia32_lzcnt_u32 (x) > #ifdef __x86_64__ > - + __builtin_ia32_tzcnt_u64 (x) > - + __builtin_ia32_lzcnt_u64 (x) > + + __builtin_ia32_tzcnt_u64 (y) > + + __builtin_ia32_lzcnt_u64 (y) > #endif > ; > } > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)