[PATCH]AArch64 Change canonization of smlal and smlsl in order to be able to optimize the vec_dup

Tamar Christina via Gcc-patches Mon, 01 Feb 2021 04:39:13 -0800

Hi All,

g:87301e3956d44ad45e384a8eb16c79029d20213a and
g:ee4c4fe289e768d3c6b6651c8bfa3fdf458934f4 changed the intrinsics to be
proper RTL but accidentally ended up creating a regression because of the
ordering in the RTL pattern.


The existing RTL patterns that combine should try to match in order to remove
the vec_dup are aarch64_vec_<su>mlal_lane<Qlane> and
aarch64_vec_<su>mult_lane<Qlane>, which expect the select register to be the
second operand of mult.

The patterns introduced have it as the first operand, so combine was unable to
remove the vec_dup.  This patch flips the operand order so that the patterns
optimize correctly.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (aarch64_<su>mlal_n<mode>,
        aarch64_<su>mlsl_n<mode>, aarch64_<su>mull_n<mode>): Flip mult operands.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bca2d8a3437fdcee77c7c357663c78c418b32a88..d1858663a4e78c0861d902b37e93c0b00d75e661 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1950,10 +1950,10 @@ (define_insn "aarch64_<su>mlal_n<mode>"
         (plus:<VWIDE>
           (mult:<VWIDE>
             (ANY_EXTEND:<VWIDE>
-              (vec_duplicate:VD_HSI
-                     (match_operand:<VEL> 3 "register_operand" "<h_con>")))
+              (match_operand:VD_HSI 2 "register_operand" "w"))
             (ANY_EXTEND:<VWIDE>
-              (match_operand:VD_HSI 2 "register_operand" "w")))
+              (vec_duplicate:VD_HSI
+                     (match_operand:<VEL> 3 "register_operand" "<h_con>"))))
           (match_operand:<VWIDE> 1 "register_operand" "0")))]
   "TARGET_SIMD"
   "<su>mlal\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]"
@@ -1980,10 +1980,10 @@ (define_insn "aarch64_<su>mlsl_n<mode>"
           (match_operand:<VWIDE> 1 "register_operand" "0")
           (mult:<VWIDE>
             (ANY_EXTEND:<VWIDE>
-              (vec_duplicate:VD_HSI
-                     (match_operand:<VEL> 3 "register_operand" "<h_con>")))
+              (match_operand:VD_HSI 2 "register_operand" "w"))
             (ANY_EXTEND:<VWIDE>
-              (match_operand:VD_HSI 2 "register_operand" "w")))))]
+              (vec_duplicate:VD_HSI
+                     (match_operand:<VEL> 3 "register_operand" "<h_con>"))))))]
   "TARGET_SIMD"
   "<su>mlsl\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]"
   [(set_attr "type" "neon_mla_<Vetype>_long")]
@@ -2078,10 +2078,10 @@ (define_insn "aarch64_<su>mull_n<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
         (mult:<VWIDE>
           (ANY_EXTEND:<VWIDE>
-            (vec_duplicate:<VCOND>
-             (match_operand:<VEL> 2 "register_operand" "<h_con>")))
+            (match_operand:VD_HSI 1 "register_operand" "w"))
           (ANY_EXTEND:<VWIDE>
-            (match_operand:VD_HSI 1 "register_operand" "w"))))]
+            (vec_duplicate:<VCOND>
+             (match_operand:<VEL> 2 "register_operand" "<h_con>")))))]
   "TARGET_SIMD"
   "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
   [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e963e5002e666e32e12b2eef965b206c7344015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target aarch64-*-* } } */
+
+#include <arm_neon.h>
+
+/*
+**add:
+**     smlal   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t add(int32x4_t acc, int16x4_t b, int16x4_t c) {
+    return vmlal_n_s16(acc, b, c[3]);
+}
+
+/*
+**sub:
+**     smlsl   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t sub(int32x4_t acc, int16x4_t b, int16x4_t c) {
+    return vmlsl_n_s16(acc, b, c[3]);
+}
+
+/*
+**smull:
+**     smull   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t smull(int16x4_t b, int16x4_t c) {
+    return vmull_n_s16(b, c[3]);
+}
+
+/*
+**umull:
+**     umull   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+uint32x4_t umull(uint16x4_t b, uint16x4_t c) {
+    return vmull_n_u16(b, c[3]);
+}
+
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" {-O[^0]} } } */


--

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bca2d8a3437fdcee77c7c357663c78c418b32a88..d1858663a4e78c0861d902b37e93c0b00d75e661 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1950,10 +1950,10 @@ (define_insn "aarch64_<su>mlal_n<mode>"
         (plus:<VWIDE>
           (mult:<VWIDE>
             (ANY_EXTEND:<VWIDE>
-              (vec_duplicate:VD_HSI
-	              (match_operand:<VEL> 3 "register_operand" "<h_con>")))
+              (match_operand:VD_HSI 2 "register_operand" "w"))
             (ANY_EXTEND:<VWIDE>
-              (match_operand:VD_HSI 2 "register_operand" "w")))
+              (vec_duplicate:VD_HSI
+	              (match_operand:<VEL> 3 "register_operand" "<h_con>"))))
           (match_operand:<VWIDE> 1 "register_operand" "0")))]
   "TARGET_SIMD"
   "<su>mlal\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]"
@@ -1980,10 +1980,10 @@ (define_insn "aarch64_<su>mlsl_n<mode>"
           (match_operand:<VWIDE> 1 "register_operand" "0")
           (mult:<VWIDE>
             (ANY_EXTEND:<VWIDE>
-              (vec_duplicate:VD_HSI
-	              (match_operand:<VEL> 3 "register_operand" "<h_con>")))
+              (match_operand:VD_HSI 2 "register_operand" "w"))
             (ANY_EXTEND:<VWIDE>
-              (match_operand:VD_HSI 2 "register_operand" "w")))))]
+              (vec_duplicate:VD_HSI
+	              (match_operand:<VEL> 3 "register_operand" "<h_con>"))))))]
   "TARGET_SIMD"
   "<su>mlsl\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]"
   [(set_attr "type" "neon_mla_<Vetype>_long")]
@@ -2078,10 +2078,10 @@ (define_insn "aarch64_<su>mull_n<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
         (mult:<VWIDE>
           (ANY_EXTEND:<VWIDE>
-            (vec_duplicate:<VCOND>
-	      (match_operand:<VEL> 2 "register_operand" "<h_con>")))
+            (match_operand:VD_HSI 1 "register_operand" "w"))
           (ANY_EXTEND:<VWIDE>
-            (match_operand:VD_HSI 1 "register_operand" "w"))))]
+            (vec_duplicate:<VCOND>
+	      (match_operand:<VEL> 2 "register_operand" "<h_con>")))))]
   "TARGET_SIMD"
   "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]"
   [(set_attr "type" "neon_mul_<Vetype>_scalar_long")]
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c
new file mode 100644
index 0000000000000000000000000000000000000000..1e963e5002e666e32e12b2eef965b206c7344015
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull-optimized.c
@@ -0,0 +1,45 @@
+/* { dg-do compile { target aarch64-*-* } } */
+
+#include <arm_neon.h>
+
+/*
+**add:
+**     smlal   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t add(int32x4_t acc, int16x4_t b, int16x4_t c) {
+    return vmlal_n_s16(acc, b, c[3]);
+}
+
+/*
+**sub:
+**     smlsl   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t sub(int32x4_t acc, int16x4_t b, int16x4_t c) {
+    return vmlsl_n_s16(acc, b, c[3]);
+}
+
+/*
+**smull:
+**     smull   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+int32x4_t smull(int16x4_t b, int16x4_t c) {
+    return vmull_n_s16(b, c[3]);
+}
+
+/*
+**umull:
+**     umull   v0.4s, v1.4h, v2.h[3]
+**     ret
+*/
+
+uint32x4_t umull(uint16x4_t b, uint16x4_t c) {
+    return vmull_n_u16(b, c[3]);
+}
+
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" {-O[^0]} } } */

[PATCH]AArch64 Change canonization of smlal and smlsl in order to be able to optimize the vec_dup

Reply via email to