This patch is inspired by a small code fragment in comment #3 of Bugzilla PR rtl-optimization/94804. That snippet appears almost unrelated to the topic of the PR, but recognizing __builtin_bswap64 from two __builtin_bswap32 calls seems like a clever and useful trick. GCC's optabs.c contains the inverse logic to expand bswap64 by IORing two bswap32 calls, so this transformation/canonicalization is safe, even on targets without suitable optab support. On x86_64, however, the swap64 of the test case becomes a single instruction.
This patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap" and a "make -k check" with no new failures. OK for mainline?

2020-08-12  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
	* match.pd (((T)bswapX(x)<<C)|bswapX(x>>C) -> bswapY(x)): New
	simplifications to recognize __builtin_bswap{32,64}.

gcc/testsuite/ChangeLog
	* gcc.dg/fold-bswap-1.c: New test.

Thanks in advance,
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK
diff --git a/gcc/match.pd b/gcc/match.pd
index 7e5c5a6..d4efbf3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3410,6 +3410,45 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (bswap (bitop:c (bswap @0) @1))
    (bitop @0 (bswap @1)))))
 
+/* Recognize ((T)bswap32(x)<<32)|bswap32(x>>32) as bswap64(x).
+   Only applies when x and the result match uint64_type_node, the
+   bswap32 arguments match uint32_type_node, and both shift counts
+   are exactly 32.  */
+(simplify
+ (bit_ior:c
+  (lshift
+   (convert (BUILT_IN_BSWAP32 (convert@4 @0)))
+   INTEGER_CST@1)
+  (convert (BUILT_IN_BSWAP32 (convert@5 (rshift @2
+                                                INTEGER_CST@3)))))
+ (if (operand_equal_p (@0, @2, 0)
+      && types_match (type, uint64_type_node)
+      && types_match (TREE_TYPE (@0), uint64_type_node)
+      && types_match (TREE_TYPE (@4), uint32_type_node)
+      && types_match (TREE_TYPE (@5), uint32_type_node)
+      && wi::to_widest (@1) == 32
+      && wi::to_widest (@3) == 32)
+  (BUILT_IN_BSWAP64 @0)))
+
+/* Recognize ((T)bswap16(x)<<16)|bswap16(x>>16) as bswap32(x).
+   The low half of x is expected to be masked with 0xffff before
+   the first bswap16; x and the result must match
+   uint32_type_node and both shift counts must be exactly 16.  */
+(simplify
+ (bit_ior:c
+  (lshift
+   (convert (BUILT_IN_BSWAP16 (convert (bit_and @0
+                                                INTEGER_CST@1))))
+   INTEGER_CST@2)
+  (convert (BUILT_IN_BSWAP16 (convert (rshift @3
+                                               INTEGER_CST@4)))))
+ (if (operand_equal_p (@0, @3, 0)
+      && types_match (type, uint32_type_node)
+      && types_match (TREE_TYPE (@0), uint32_type_node)
+      && wi::to_widest (@1) == 65535
+      && wi::to_widest (@2) == 16
+      && wi::to_widest (@4) == 16)
+  (BUILT_IN_BSWAP32 @0)))
 
 /* Combine COND_EXPRs and VEC_COND_EXPRs.  */
 
diff --git a/gcc/testsuite/gcc.dg/fold-bswap-1.c b/gcc/testsuite/gcc.dg/fold-bswap-1.c
new file mode 100644
index 0000000..f14f731
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bswap-1.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int32plus } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+/* Byte-swap a 32-bit value using two 16-bit swaps; expected to fold
+   to a single __builtin_bswap32.  */
+unsigned int swap32(unsigned int x)
+{
+  unsigned int a = __builtin_bswap16(x);
+  x >>= 16;
+  a <<= 16;
+  return __builtin_bswap16(x) | a;
+}
+
+/* Byte-swap a 64-bit value using two 32-bit swaps; expected to fold
+   to a single __builtin_bswap64.  Uses unsigned long long because
+   unsigned long is only 32 bits wide on ILP32 and LLP64 targets,
+   where shifting it by 32 would be undefined.  */
+unsigned long long swap64(unsigned long long x)
+{
+  unsigned long long a = __builtin_bswap32(x);
+  x >>= 32;
+  a <<= 32;
+  return __builtin_bswap32(x) | a;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 1 "optimized" } } */
+