https://gcc.gnu.org/g:7e0149fdb01b595949a3a6add478b3eed9acf478

commit r16-836-g7e0149fdb01b595949a3a6add478b3eed9acf478
Author: Dhruv Chawla <dhr...@nvidia.com>
Date:   Fri May 9 01:47:45 2025 -0700

    aarch64: Match unpredicated shift patterns for ADR, SRA and ADDHNB instructions
    
    This patch modifies the shift expander to immediately lower constant
    shifts without wrapping them in an UNSPEC_PRED_X.  It also modifies the
    ADR, SRA and ADDHNB patterns to match the lowered forms of the shifts,
    as the predicate register is not required for these instructions.
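
    As a minimal sketch (not part of the patch itself; the function names and
    the exact instruction selection are illustrative assumptions), this is the
    kind of scalar code whose vectorized form these patterns target, assuming
    SVE/SVE2 autovectorization is enabled:

        /* Shift by a constant: with this change it can be emitted as an
           unpredicated immediate shift, with no ptrue set-up.  */
        void
        shift_by_constant (unsigned int *restrict dst,
                           const unsigned int *restrict src, int n)
        {
          for (int i = 0; i < n; i++)
            dst[i] = src[i] >> 2;
        }

        /* Add then take the high half of each 32-bit element: a candidate
           for the *bitmask_shift_plus<mode> (ADDHNB) pattern on SVE2.  */
        void
        add_high_narrow (unsigned int *restrict dst,
                         const unsigned int *restrict a,
                         const unsigned int *restrict b, int n)
        {
          for (int i = 0; i < n; i++)
            dst[i] = (a[i] + b[i]) >> 16;
        }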
    
    Bootstrapped and regtested on aarch64-linux-gnu.
    
    Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
    Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-sve.md (@aarch64_adr<mode>_shift):
            Match lowered form of ashift.
            (*aarch64_adr<mode>_shift): Likewise.
            (*aarch64_adr_shift_sxtw): Likewise.
            (*aarch64_adr_shift_uxtw): Likewise.
            (<ASHIFT:optab><mode>3): Check amount instead of operands[2] in
            aarch64_sve_<lr>shift_operand.
            (v<optab><mode>3): Generate unpredicated shifts for constant
            operands.
            (@aarch64_pred_<optab><mode>): Convert to a define_expand.
            (*aarch64_pred_<optab><mode>): Create define_insn_and_split pattern
            from @aarch64_pred_<optab><mode>.
            (*post_ra_v_ashl<mode>3): Rename to ...
            (aarch64_vashl<mode>3_const): ... this and remove reload
            requirement.
            (*post_ra_v_<optab><mode>3): Rename to ...
            (aarch64_v<optab><mode>3_const): ... this and remove reload
            requirement.
            * config/aarch64/aarch64-sve2.md
            (@aarch64_sve_add_<sve_int_op><mode>): Match lowered form of
            SHIFTRT.
            (*aarch64_sve2_sra<mode>): Likewise.
            (*bitmask_shift_plus<mode>): Match lowered form of lshiftrt.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  | 119 +++++++++++++++++++------------------
 gcc/config/aarch64/aarch64-sve2.md |  46 +++++---------
 2 files changed, 75 insertions(+), 90 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index bf7569f932b6..e1ec778b10df 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -4234,80 +4234,57 @@
 (define_expand "@aarch64_adr<mode>_shift"
   [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
        (plus:SVE_FULL_SDI
-         (unspec:SVE_FULL_SDI
-           [(match_dup 4)
-            (ashift:SVE_FULL_SDI
-              (match_operand:SVE_FULL_SDI 2 "register_operand")
-              (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))]
-           UNSPEC_PRED_X)
+         (ashift:SVE_FULL_SDI
+           (match_operand:SVE_FULL_SDI 2 "register_operand")
+           (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))
          (match_operand:SVE_FULL_SDI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {
-    operands[4] = CONSTM1_RTX (<VPRED>mode);
-  }
 )
 
-(define_insn_and_rewrite "*aarch64_adr<mode>_shift"
+(define_insn "*aarch64_adr<mode>_shift"
   [(set (match_operand:SVE_24I 0 "register_operand" "=w")
        (plus:SVE_24I
-         (unspec:SVE_24I
-           [(match_operand 4)
-            (ashift:SVE_24I
-              (match_operand:SVE_24I 2 "register_operand" "w")
-              (match_operand:SVE_24I 3 "const_1_to_3_operand"))]
-           UNSPEC_PRED_X)
+         (ashift:SVE_24I
+           (match_operand:SVE_24I 2 "register_operand" "w")
+           (match_operand:SVE_24I 3 "const_1_to_3_operand"))
          (match_operand:SVE_24I 1 "register_operand" "w")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   "adr\t%0.<Vctype>, [%1.<Vctype>, %2.<Vctype>, lsl %3]"
-  "&& !CONSTANT_P (operands[4])"
-  {
-    operands[4] = CONSTM1_RTX (<VPRED>mode);
-  }
 )
 
 ;; Same, but with the index being sign-extended from the low 32 bits.
 (define_insn_and_rewrite "*aarch64_adr_shift_sxtw"
   [(set (match_operand:VNx2DI 0 "register_operand" "=w")
        (plus:VNx2DI
-         (unspec:VNx2DI
-           [(match_operand 4)
-            (ashift:VNx2DI
-              (unspec:VNx2DI
-                [(match_operand 5)
-                 (sign_extend:VNx2DI
-                   (truncate:VNx2SI
-                     (match_operand:VNx2DI 2 "register_operand" "w")))]
-                UNSPEC_PRED_X)
-              (match_operand:VNx2DI 3 "const_1_to_3_operand"))]
-           UNSPEC_PRED_X)
+         (ashift:VNx2DI
+           (unspec:VNx2DI
+             [(match_operand 4)
+              (sign_extend:VNx2DI
+                (truncate:VNx2SI
+                  (match_operand:VNx2DI 2 "register_operand" "w")))]
+            UNSPEC_PRED_X)
+           (match_operand:VNx2DI 3 "const_1_to_3_operand"))
          (match_operand:VNx2DI 1 "register_operand" "w")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   "adr\t%0.d, [%1.d, %2.d, sxtw %3]"
-  "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))"
+  "&& !CONSTANT_P (operands[4])"
   {
-    operands[5] = operands[4] = CONSTM1_RTX (VNx2BImode);
+    operands[4] = CONSTM1_RTX (VNx2BImode);
   }
 )
 
 ;; Same, but with the index being zero-extended from the low 32 bits.
-(define_insn_and_rewrite "*aarch64_adr_shift_uxtw"
+(define_insn "*aarch64_adr_shift_uxtw"
   [(set (match_operand:VNx2DI 0 "register_operand" "=w")
        (plus:VNx2DI
-         (unspec:VNx2DI
-           [(match_operand 5)
-            (ashift:VNx2DI
-              (and:VNx2DI
-                (match_operand:VNx2DI 2 "register_operand" "w")
-                (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))
-              (match_operand:VNx2DI 3 "const_1_to_3_operand"))]
-           UNSPEC_PRED_X)
+         (ashift:VNx2DI
+           (and:VNx2DI
+             (match_operand:VNx2DI 2 "register_operand" "w")
+             (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))
+           (match_operand:VNx2DI 3 "const_1_to_3_operand"))
          (match_operand:VNx2DI 1 "register_operand" "w")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
   "adr\t%0.d, [%1.d, %2.d, uxtw %3]"
-  "&& !CONSTANT_P (operands[5])"
-  {
-    operands[5] = CONSTM1_RTX (VNx2BImode);
-  }
 )
 
 ;; -------------------------------------------------------------------------
@@ -4899,7 +4876,7 @@
     if (CONST_INT_P (operands[2]))
       {
        amount = gen_const_vec_duplicate (<MODE>mode, operands[2]);
-       if (!aarch64_sve_<lr>shift_operand (operands[2], <MODE>mode))
+       if (!aarch64_sve_<lr>shift_operand (amount, <MODE>mode))
          amount = force_reg (<MODE>mode, amount);
       }
     else
@@ -4923,15 +4900,40 @@
          UNSPEC_PRED_X))]
   "TARGET_SVE"
   {
+    if (CONSTANT_P (operands[2]))
+      {
+       emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[1],
+                                                     operands[2]));
+       DONE;
+      }
     operands[3] = aarch64_ptrue_reg (<VPRED>mode);
   }
 )
 
-;; Shift by a vector, predicated with a PTRUE.  We don't actually need
-;; the predicate for the first alternative, but using Upa or X isn't
-;; likely to gain much and would make the instruction seem less uniform
-;; to the register allocator.
-(define_insn_and_split "@aarch64_pred_<optab><mode>"
+;; Shift by a vector, predicated with a PTRUE.
+(define_expand "@aarch64_pred_<optab><mode>"
+  [(set (match_operand:SVE_I 0 "register_operand")
+       (unspec:SVE_I
+         [(match_operand:<VPRED> 1 "register_operand")
+          (ASHIFT:SVE_I
+            (match_operand:SVE_I 2 "register_operand")
+            (match_operand:SVE_I 3 "aarch64_sve_<lr>shift_operand"))]
+         UNSPEC_PRED_X))]
+  "TARGET_SVE"
+  {
+    if (CONSTANT_P (operands[3]))
+      {
+       emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[2],
+                                                     operands[3]));
+       DONE;
+      }
+  }
+)
+
+;; We don't actually need the predicate for the first alternative, but
+;; using Upa or X isn't likely to gain much and would make the instruction
+;; seem less uniform to the register allocator.
+(define_insn_and_split "*aarch64_pred_<optab><mode>"
   [(set (match_operand:SVE_I 0 "register_operand")
        (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand")
@@ -4946,33 +4948,32 @@
     [ w        , Upl , w , 0     ; *              ] <shift>r\t%0.<Vetype>, %1/m, %3.<Vetype>, %2.<Vetype>
     [ ?&w      , Upl , w , w     ; yes            ] movprfx\t%0, %2\;<shift>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
   }
-  "&& reload_completed
-   && !register_operand (operands[3], <MODE>mode)"
+  "&& !register_operand (operands[3], <MODE>mode)"
   [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))]
   ""
 )
 
-;; Unpredicated shift operations by a constant (post-RA only).
+;; Unpredicated shift operations by a constant.
 ;; These are generated by splitting a predicated instruction whose
 ;; predicate is unused.
-(define_insn "*post_ra_v_ashl<mode>3"
+(define_insn "aarch64_vashl<mode>3_const"
   [(set (match_operand:SVE_I 0 "register_operand")
        (ashift:SVE_I
          (match_operand:SVE_I 1 "register_operand")
          (match_operand:SVE_I 2 "aarch64_simd_lshift_imm")))]
-  "TARGET_SVE && reload_completed"
+  "TARGET_SVE"
   {@ [ cons: =0 , 1 , 2   ]
      [ w       , w , vs1 ] add\t%0.<Vetype>, %1.<Vetype>, %1.<Vetype>
      [ w       , w , Dl  ] lsl\t%0.<Vetype>, %1.<Vetype>, #%2
   }
 )
 
-(define_insn "*post_ra_v_<optab><mode>3"
+(define_insn "aarch64_v<optab><mode>3_const"
   [(set (match_operand:SVE_I 0 "register_operand" "=w")
        (SHIFTRT:SVE_I
          (match_operand:SVE_I 1 "register_operand" "w")
          (match_operand:SVE_I 2 "aarch64_simd_rshift_imm")))]
-  "TARGET_SVE && reload_completed"
+  "TARGET_SVE"
   "<shift>\t%0.<Vetype>, %1.<Vetype>, #%2"
 )
 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 871cf0bd2e83..62524f36de65 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1932,40 +1932,27 @@
 (define_expand "@aarch64_sve_add_<sve_int_op><mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
        (plus:SVE_FULL_I
-         (unspec:SVE_FULL_I
-           [(match_dup 4)
-            (SHIFTRT:SVE_FULL_I
-              (match_operand:SVE_FULL_I 2 "register_operand")
-              (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))]
-           UNSPEC_PRED_X)
-        (match_operand:SVE_FULL_I 1 "register_operand")))]
+         (SHIFTRT:SVE_FULL_I
+           (match_operand:SVE_FULL_I 2 "register_operand")
+           (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))
+         (match_operand:SVE_FULL_I 1 "register_operand")))]
   "TARGET_SVE2"
-  {
-    operands[4] = CONSTM1_RTX (<VPRED>mode);
-  }
 )
 
 ;; Pattern-match SSRA and USRA as a predicated operation whose predicate
 ;; isn't needed.
-(define_insn_and_rewrite "*aarch64_sve2_sra<mode>"
+(define_insn "*aarch64_sve2_sra<mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
        (plus:SVE_FULL_I
-         (unspec:SVE_FULL_I
-           [(match_operand 4)
-            (SHIFTRT:SVE_FULL_I
-              (match_operand:SVE_FULL_I 2 "register_operand")
-              (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))]
-           UNSPEC_PRED_X)
+         (SHIFTRT:SVE_FULL_I
+           (match_operand:SVE_FULL_I 2 "register_operand")
+           (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))
         (match_operand:SVE_FULL_I 1 "register_operand")))]
   "TARGET_SVE2"
   {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
     [ w        , 0 , w ; *              ] <sra_op>sra\t%0.<Vetype>, %2.<Vetype>, #%3
     [ ?&w      , w , w ; yes            ] movprfx\t%0, %1\;<sra_op>sra\t%0.<Vetype>, %2.<Vetype>, #%3
   }
-  "&& !CONSTANT_P (operands[4])"
-  {
-    operands[4] = CONSTM1_RTX (<VPRED>mode);
-  }
 )
 
 ;; SRSRA and URSRA.
@@ -2715,17 +2702,14 @@
 ;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
 (define_insn "*bitmask_shift_plus<mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
-       (unspec:SVE_FULL_HSDI
-          [(match_operand:<VPRED> 1)
-           (lshiftrt:SVE_FULL_HSDI
-             (plus:SVE_FULL_HSDI
-               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
-               (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
-             (match_operand:SVE_FULL_HSDI 4
-                "aarch64_simd_shift_imm_vec_exact_top" ""))]
-          UNSPEC_PRED_X))]
+       (lshiftrt:SVE_FULL_HSDI
+         (plus:SVE_FULL_HSDI
+           (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+           (match_operand:SVE_FULL_HSDI 2 "register_operand" "w"))
+         (match_operand:SVE_FULL_HSDI 3
+           "aarch64_simd_shift_imm_vec_exact_top" "")))]
   "TARGET_SVE2"
-  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
+  "addhnb\t%0.<Ventype>, %1.<Vetype>, %2.<Vetype>"
 )
 
 ;; -------------------------------------------------------------------------
