This patch is inspired by a small code fragment in comment #3 of
bugzilla PR rtl-optimization/94804.  That snippet appears almost
unrelated to the topic of the PR, but recognizing __builtin_bswap64
from two __builtin_bswap32 calls seems like a clever/useful trick.
GCC's optabs.c contains the inverse logic to expand bswap64 by
IORing two bswap32 calls, so this transformation/canonicalization
is safe, even on targets without suitable optab support.  But
on x86_64, the swap64 function in the test case below compiles down
to a single bswap instruction.


This patch has been tested on x86_64-pc-linux-gnu with a "make
bootstrap" and a "make -k check" with no new failures.
Ok for mainline?


2020-08-12  Roger Sayle  <ro...@nextmovesoftware.com>

gcc/ChangeLog
        * match.pd (((T)bswapX(x)<<C)|bswapX(x>>C) -> bswapY(x)):
        New simplifications to recognize __builtin_bswap{32,64}.

gcc/testsuite/ChangeLog
        * gcc.dg/fold-bswap-1.c: New test.


Thanks in advance,
Roger
--
Roger Sayle
NextMove Software
Cambridge, UK

diff --git a/gcc/match.pd b/gcc/match.pd
index 7e5c5a6..d4efbf3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3410,6 +3410,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    (bswap (bitop:c (bswap @0) @1))
    (bitop @0 (bswap @1)))))
 
+/* Recognize ((T)bswap32(x)<<32)|bswap32(x>>32) as bswap64(x).  */
+(simplify
+  (bit_ior:c
+    (lshift
+      (convert (BUILT_IN_BSWAP32 (convert@4 @0)))
+      INTEGER_CST@1)
+    (convert (BUILT_IN_BSWAP32 (convert@5 (rshift @2
+                                                 INTEGER_CST@3)))))
+  (if (operand_equal_p (@0, @2, 0)
+       && types_match (type, uint64_type_node)
+       && types_match (TREE_TYPE (@0), uint64_type_node)
+       && types_match (TREE_TYPE (@4), uint32_type_node)
+       && types_match (TREE_TYPE (@5), uint32_type_node)
+       && wi::to_widest (@1) == 32
+       && wi::to_widest (@3) == 32)
+    (BUILT_IN_BSWAP64 @0)))
+
+/* Recognize ((T)bswap16(x)<<16)|bswap16(x>>16) as bswap32(x).  */
+(simplify
+  (bit_ior:c
+    (lshift
+      (convert (BUILT_IN_BSWAP16 (convert (bit_and @0
+                                                  INTEGER_CST@1))))
+      (INTEGER_CST@2))
+    (convert (BUILT_IN_BSWAP16 (convert (rshift @3
+                                               INTEGER_CST@4)))))
+  (if (operand_equal_p (@0, @3, 0)
+       && types_match (type, uint32_type_node)
+       && types_match (TREE_TYPE (@0), uint32_type_node)
+       && wi::to_widest (@1) == 65535
+       && wi::to_widest (@2) == 16
+       && wi::to_widest (@4) == 16)
+    (BUILT_IN_BSWAP32 @0)))
 
 /* Combine COND_EXPRs and VEC_COND_EXPRs.  */
 
diff --git a/gcc/testsuite/gcc.dg/fold-bswap-1.c 
b/gcc/testsuite/gcc.dg/fold-bswap-1.c
new file mode 100644
index 0000000..f14f731
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bswap-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int swap32(unsigned int x)
+{
+    unsigned int a = __builtin_bswap16(x);
+    x >>= 16;
+    a <<= 16;
+    return __builtin_bswap16(x) | a;
+}
+
+unsigned long swap64(unsigned long x)
+{
+    unsigned long a = __builtin_bswap32(x);
+    x >>= 32;
+    a <<= 32;
+    return __builtin_bswap32(x) | a;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 1 "optimized" } } */
+

Reply via email to