From: MITSUNARI Shigeo <[email protected]>

When the register allocator selects the MUL-based highpart pattern
(umuldi3_highpart) with the source value already in %rdx, it inserts
a redundant mov to %rax before the mul instruction.  Add a peephole2
that detects this mov + mul sequence and converts it to a single mulx,
eliminating the extra mov.

This improves inlined loops that perform multiple unsigned divisions
by constants.  For example, a loop with three div-by-constant
operations now generates 15 instructions (matching LLVM) instead
of 18.

Before (loop body excerpt):
        mov     rax, rdx
        mul     r9

After:
        mulx    rdx, rax, r9

gcc/ChangeLog:

        * config/i386/i386.md: Add peephole2 to convert
        mov + umul_highpart to mulx on BMI2 targets.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/bmi2-mulx-highpart-2.c: New test.

Signed-off-by: MITSUNARI Shigeo <[email protected]>
---
 gcc/config/i386/i386.md                       | 17 +++++++++++++++++
 .../gcc.target/i386/bmi2-mulx-highpart-2.c    | 19 +++++++++++++++++++
 2 files changed, 36 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/bmi2-mulx-highpart-2.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 472f9d41332..1c394690b04 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -11522,6 +11522,23 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "<MODE>")])
 
+;; Convert mov %rdx, %rax; mulq %src -> mulx %src, %rax, %out, provided
+;; %src does not mention %rax (the copy into it is removed).
+(define_peephole2
+  [(set (match_operand:DWIH 0 "register_operand")
+       (match_operand:DWIH 1 "register_operand"))
+   (parallel [(set (match_operand:DWIH 2 "register_operand")
+                  (umul_highpart:DWIH (match_dup 0)
+                       (match_operand:DWIH 3 "nonimmediate_operand")))
+             (clobber (match_dup 0))
+             (clobber (reg:CC FLAGS_REG))])]
+  "TARGET_BMI2 && REGNO (operands[1]) == DX_REG
+   && REGNO (operands[0]) != REGNO (operands[2])
+   && !reg_overlap_mentioned_p (operands[0], operands[3])"
+  [(parallel [(set (match_dup 2)
+                  (umul_highpart:DWIH (match_dup 1) (match_dup 3)))
+             (clobber (match_dup 0))])])
+
 ;; Highpart multiplication patterns
 (define_insn "<s>mul<mode>3_highpart"
   [(set (match_operand:DWIH 0 "register_operand" "=d")
diff --git a/gcc/testsuite/gcc.target/i386/bmi2-mulx-highpart-2.c b/gcc/testsuite/gcc.target/i386/bmi2-mulx-highpart-2.c
new file mode 100644
index 00000000000..be56cf15d07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bmi2-mulx-highpart-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mbmi2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target *-*-linux* *-*-gnu* } } } */
+
+/*
+**div7loop:
+**...
+**     mulx    %rsi, %rax, %rdx
+**...
+*/
+
+unsigned int
+div7loop (unsigned int x)
+{
+  /* Each iteration divides by the constant 7.  */
+  for (int iter = 0; iter < 10000; iter++)
+    x ^= (iter ^ x) / 7;
+  return x;
+}
-- 
2.43.0

Reply via email to