PowerPC ISA 3.0 does not have a byte-reverse instruction that operates on the
GPRs, but it does have vector byte swap half-word, word, double-word operations
in the VSX registers.  The enclosed patch enables generation of the byte
reversal instructions for register-register operations.  It still prefers to
generate the load with byte reverse (L{H,W,D}BRX) or store with byte reverse
(ST{H,W,D}BRX) instructions over the register sequence.

For 16-bit and 32-bit byte swaps, it typically does the traditional operation
in GPR registers, but it will generate XXBR{H,W} if the values are in vector
registers.

For 64-bit swaps, it no longer generates the 9-instruction GPR sequence and
uses XXBRD instead.  I did some timing runs on a prototype power9 system, and
it was slightly faster to do a direct move to the vector unit, XXBRD, and a
direct move back to a GPR than the traditional sequence.

I did bootstraps on little endian Power8 and Power9 systems (with the default
cpu set to power8 and power9 respectively).  There were no regressions.  Can I
check this patch into the trunk?

[gcc]
2017-11-08  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        * config/rs6000/rs6000.md (bswaphi2_reg): On ISA 3.0 systems,
        enable generating XXBR{H,W} if the value is in a vector
        register.
        (bswapsi2_reg): Likewise.
        (bswapdi2_reg): On ISA 3.0 systems, use XXBRD to do bswap64
        instead of doing the GPR sequence used on previous machines.
        (bswapdi2_xxbrd): Likewise.
        (bswapdi2_reg splitters): Use int_reg_operand instead of
        gpc_reg_operand to not match when XXBRD is generated.

[gcc/testsuite]
2017-11-08  Michael Meissner  <meiss...@linux.vnet.ibm.com>

        * gcc.target/powerpc/p9-xxbr-3.c: New test.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797
Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md (revision 254516)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2432,13 +2432,15 @@ (define_insn "bswap<mode>2_store"
   [(set_attr "type" "store")])
 
 (define_insn_and_split "bswaphi2_reg"
-  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
+  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r,wo")
        (bswap:HI
-        (match_operand:HI 1 "gpc_reg_operand" "r")))
-   (clobber (match_scratch:SI 2 "=&r"))]
+        (match_operand:HI 1 "gpc_reg_operand" "r,wo")))
+   (clobber (match_scratch:SI 2 "=&r,X"))]
   ""
-  "#"
-  "reload_completed"
+  "@
+   #
+   xxbrh %x0,%x1"
+  "reload_completed && int_reg_operand (operands[0], HImode)"
   [(set (match_dup 3)
        (and:SI (lshiftrt:SI (match_dup 4)
                             (const_int 8))
@@ -2454,18 +2456,20 @@ (define_insn_and_split "bswaphi2_reg"
   operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
   operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
 }
-  [(set_attr "length" "12")
-   (set_attr "type" "*")])
+  [(set_attr "length" "12,4")
+   (set_attr "type" "*,vecperm")])
 
 ;; We are always BITS_BIG_ENDIAN, so the bit positions below in
 ;; zero_extract insns do not change for -mlittle.
 (define_insn_and_split "bswapsi2_reg"
-  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r,wo")
        (bswap:SI
-        (match_operand:SI 1 "gpc_reg_operand" "r")))]
+        (match_operand:SI 1 "gpc_reg_operand" "r,wo")))]
   ""
-  "#"
-  "reload_completed"
+  "@
+   #
+   xxbrw %x0,%x1"
+  "reload_completed && int_reg_operand (operands[0], SImode)"
   [(set (match_dup 0)                                  ; DABC
        (rotate:SI (match_dup 1)
                   (const_int 24)))
@@ -2481,7 +2485,9 @@ (define_insn_and_split "bswapsi2_reg"
                        (const_int 255))
                (and:SI (match_dup 0)
                        (const_int -256))))]
-  "")
+  ""
+  [(set_attr "length" "12,4")
+   (set_attr "type" "*,vecperm")])
 
 ;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
 ;; we do for L{H,W}BRX and ST{H,W}BRX above.  If not, we have to generate more
@@ -2507,6 +2513,8 @@ (define_expand "bswapdi2"
        emit_insn (gen_bswapdi2_load (dest, src));
       else if (MEM_P (dest))
        emit_insn (gen_bswapdi2_store (dest, src));
+      else if (TARGET_P9_VECTOR)
+       emit_insn (gen_bswapdi2_xxbrd (dest, src));
       else
        emit_insn (gen_bswapdi2_reg (dest, src));
       DONE;
@@ -2537,6 +2545,13 @@ (define_insn "bswapdi2_store"
   "stdbrx %1,%y0"
   [(set_attr "type" "store")])
 
+(define_insn "bswapdi2_xxbrd"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=wo")
+       (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "wo")))]
+  "TARGET_POWERPC64 && TARGET_P9_VECTOR"
+  "xxbrd %x0,%x1"
+  [(set_attr "type" "vecperm")])
+
 (define_insn "bswapdi2_reg"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
        (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
@@ -2544,7 +2559,8 @@ (define_insn "bswapdi2_reg"
    (clobber (match_scratch:DI 3 "=&r"))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "#"
-  [(set_attr "length" "36")])
+  [(set_attr "length" "36")
+   (set_attr "type" "*")])
 
 ;; Non-power7/cell, fall back to use lwbrx/stwbrx
 (define_insn "*bswapdi2_64bit"
@@ -2560,7 +2576,7 @@ (define_insn "*bswapdi2_64bit"
   [(set_attr "length" "16,12,36")])
 
 (define_split
-  [(set (match_operand:DI 0 "gpc_reg_operand" "")
+  [(set (match_operand:DI 0 "int_reg_operand" "")
        (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "")))
    (clobber (match_operand:DI 2 "gpc_reg_operand" ""))
    (clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
@@ -2625,7 +2641,7 @@ (define_split
 
 (define_split
   [(set (match_operand:DI 0 "indexed_or_indirect_operand" "")
-       (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "")))
+       (bswap:DI (match_operand:DI 1 "int_reg_operand" "")))
    (clobber (match_operand:DI 2 "gpc_reg_operand" ""))
    (clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
   "TARGET_POWERPC64 && !TARGET_LDBRX && reload_completed"
@@ -2687,10 +2703,10 @@ (define_split
 }")
 
 (define_split
-  [(set (match_operand:DI 0 "gpc_reg_operand" "")
-       (bswap:DI (match_operand:DI 1 "gpc_reg_operand" "")))
-   (clobber (match_operand:DI 2 "gpc_reg_operand" ""))
-   (clobber (match_operand:DI 3 "gpc_reg_operand" ""))]
+  [(set (match_operand:DI 0 "int_reg_operand" "")
+       (bswap:DI (match_operand:DI 1 "int_reg_operand" "")))
+   (clobber (match_operand:DI 2 "int_reg_operand" ""))
+   (clobber (match_operand:DI 3 "int_reg_operand" ""))]
   "TARGET_POWERPC64 && reload_completed"
   [(const_int 0)]
   "
@@ -2722,9 +2738,9 @@ (define_insn "bswapdi2_32bit"
   [(set_attr "length" "16,12,36")])
 
 (define_split
-  [(set (match_operand:DI 0 "gpc_reg_operand" "")
+  [(set (match_operand:DI 0 "int_reg_operand" "")
        (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "")))
-   (clobber (match_operand:SI 2 "gpc_reg_operand" ""))]
+   (clobber (match_operand:SI 2 "int_reg_operand" ""))]
   "!TARGET_POWERPC64 && reload_completed"
   [(const_int 0)]
   "
Index: gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c        (nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/p9-xxbr-3.c        (working copy)
@@ -0,0 +1,99 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mpower9-vector -O2" } */
+
+/* Verify that the XXBR{H,W} instructions are generated if the value is
+   forced to be in a vector register, and XXBRD is generated all of the
+   time for register bswap64's.  */
+
+unsigned short
+do_bswap16_mem (unsigned short *p)
+{
+  return __builtin_bswap16 (*p);       /* LHBRX.  */
+}
+
+unsigned short
+do_bswap16_reg (unsigned short a)
+{
+  return __builtin_bswap16 (a);                /* gpr sequences.  */
+}
+
+void
+do_bswap16_store (unsigned short *p, unsigned short a)
+{
+  *p = __builtin_bswap16 (a);          /* STHBRX.  */
+}
+
+unsigned short
+do_bswap16_vect (unsigned short a)
+{
+  __asm__ (" # %x0" : "+v" (a));
+  return __builtin_bswap16 (a);                /* XXBRH.  */
+}
+
+unsigned int
+do_bswap32_mem (unsigned int *p)
+{
+  return __builtin_bswap32 (*p);       /* LWBRX.  */
+}
+
+unsigned int
+do_bswap32_reg (unsigned int a)
+{
+  return __builtin_bswap32 (a);                /* gpr sequences.  */
+}
+
+void
+do_bswap32_store (unsigned int *p, unsigned int a)
+{
+  *p = __builtin_bswap32 (a);          /* STWBRX.  */
+}
+
+unsigned int
+do_bswap32_vect (unsigned int a)
+{
+  __asm__ (" # %x0" : "+v" (a));
+  return __builtin_bswap32 (a);                /* XXBRW.  */
+}
+
+unsigned long
+do_bswap64_mem (unsigned long *p)
+{
+  return __builtin_bswap64 (*p);       /* LDBRX.  */
+}
+
+unsigned long
+do_bswap64_reg (unsigned long a)
+{
+  return __builtin_bswap64 (a);                /* XXBRD.  */
+}
+
+void
+do_bswap64_store (unsigned long *p, unsigned int a)
+{
+  *p = __builtin_bswap64 (a);          /* STDBRX.  */
+}
+
+double
+do_bswap64_double (unsigned long a)
+{
+  return (double) __builtin_bswap64 (a);       /* XXBRD.  */
+}
+
+unsigned long
+do_bswap64_vect (unsigned long a)
+{
+  __asm__ (" # %x0" : "+v" (a));       /* XXBRD.  */
+  return __builtin_bswap64 (a);
+}
+
+/* Make sure XXBR{H,W} are not generated by default, but XXBRD is.  */
+/* { dg-final { scan-assembler-times "xxbrd"  3  } } */
+/* { dg-final { scan-assembler-times "xxbrh"  1  } } */
+/* { dg-final { scan-assembler-times "xxbrw"  1  } } */
+/* { dg-final { scan-assembler-times "ldbrx"  1  } } */
+/* { dg-final { scan-assembler-times "lhbrx"  1  } } */
+/* { dg-final { scan-assembler-times "lwbrx"  1  } } */
+/* { dg-final { scan-assembler-times "stdbrx" 1  } } */
+/* { dg-final { scan-assembler-times "sthbrx" 1  } } */
+/* { dg-final { scan-assembler-times "stwbrx" 1  } } */

Reply via email to