I have pushed the attached patch, which Andrew has preapproved for GCC 16 as a performance regression fix.

-Sandra
From 10545cbf39b0ae432ebce40ddae811dfe5668f9a Mon Sep 17 00:00:00 2001
From: Sandra Loosemore <[email protected]>
Date: Mon, 23 Feb 2026 16:10:01 +0000
Subject: [PATCH] amdgcn: prefer scalar ALU instructions over vector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes a performance regression that crept in with commit
gcc-16-2237-g0eee2dd2865 ("Don't clobber VCC if we don't need to").
Removing the clobber from vector-register ALU insn pattern
alternatives seems to make LRA think those alternatives are less
costly than the scalar-register alternatives, which do need the
clobber, resulting in extra move instructions instead.

In this patch I've adjusted all the ALU patterns (not just the ones
modified by the above-mentioned commit) where the scalar-register
alternatives clobber and vector-register alternatives don't, to
slightly disparage the latter using the "?" constraint modifier.

I also fixed a related issue: the comment on addptrdi3 says the
scalar-register alternative is preferred, but it was listed after the
vector alternative, so other things being equal it would still prefer
the vector expansion.  I've not changed the constraints, but simply
flipped the order of the alternatives.

Thanks to Tobias Burnus for identifying the commit that caused the
performance regression, Andrew Stubbs for making the connection between
clobbers and LRA costs, and Arsen Arsenović for setting up the benchmark
scripting I used to compare results.

gcc/ChangeLog
	* config/gcn/gcn.md (addsi3): Disparage vector alternative.
	(addptrdi3): Reverse order of vector and scalar alternatives
	to prefer the latter.
	(subsi3): Disparage vector alternatives.
	(mulsi3): Likewise.
	(muldi3): Likewise.
	(bitunop <expander>si2): Likewise.
	(vec_and_scalar_com <expander>si3): Likewise.
	(vec_and_scalar_nocom <expander>si3): Likewise.
	(one_cmpldi2): Likewise.
	(vec_and_scalar64_com <expander>di3): Likwise.
	(vec_and_scalar64_nocom <expander>di3): Likwise.
---
 gcc/config/gcn/gcn.md | 52 +++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 5957b29f748..c49bd6cf8bd 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -1127,7 +1127,7 @@
 ;; {{{ ALU special cases: Plus
 
 (define_insn "addsi3"
-  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,   v")
+  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,  ?v")
         (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA,   v")
 		 (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B,vBSv")))
    (clobber (match_scratch:BI 3			       "= cs, cs, cs,   X"))]
@@ -1285,23 +1285,14 @@
 ; The SGPR alternative is preferred as it is typically used with mov_sgprbase.
 
 (define_insn "addptrdi3"
-  [(set (match_operand:DI 0 "register_operand"		 "= v, Sg")
+  [(set (match_operand:DI 0 "register_operand"		 "= Sg, v")
     (unspec:DI [
-	(plus:DI (match_operand:DI 1 "register_operand"	 "^v0,Sg0")
-		 (match_operand:DI 2 "nonmemory_operand" "vDA,SgDB"))]
+	(plus:DI (match_operand:DI 1 "register_operand"	 " Sg0,^v0")
+		 (match_operand:DI 2 "nonmemory_operand" "SgDB,vDA"))]
 	UNSPEC_ADDPTR))]
   ""
   {
     if (which_alternative == 0)
-      {
-	rtx new_operands[4] = { operands[0], operands[1], operands[2],
-				gen_rtx_REG (DImode, CC_SAVE_REG) };
-
-	output_asm_insn ("v_add_co_u32\t%L0, %3, %L2, %L1", new_operands);
-	output_asm_insn ("{v_addc_co_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3",
-			 new_operands);
-      }
-    else
       {
 	rtx new_operands[4] = { operands[0], operands[1], operands[2],
 				gen_rtx_REG (BImode, CC_SAVE_REG) };
@@ -1311,17 +1302,26 @@
 	output_asm_insn ("s_addc_u32\t%H0, %H1, %H2", new_operands);
 	output_asm_insn ("s_cmpk_lg_u32\t%3, 0", new_operands);
       }
+    else
+      {
+	rtx new_operands[4] = { operands[0], operands[1], operands[2],
+				gen_rtx_REG (DImode, CC_SAVE_REG) };
+
+	output_asm_insn ("v_add_co_u32\t%L0, %3, %L2, %L1", new_operands);
+	output_asm_insn ("{v_addc_co_u32|v_add_co_ci_u32}\t%H0, %3, %H2, %H1, %3",
+			 new_operands);
+      }
 
     return "";
   }
-  [(set_attr "type" "vmult,mult")
-   (set_attr "length" "16,24")])
+  [(set_attr "type" "mult,vmult")
+   (set_attr "length" "24,16")])
 
 ;; }}}
 ;; {{{ ALU special cases: Minus
 
 (define_insn "subsi3"
-  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,   v")
+  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,   ?v,  ?v")
 	(minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA,    v,vBSv")
 		  (match_operand:SI 2 "gcn_alu_operand" "SgA,  B, vBSv,   v")))
    (clobber (match_scratch:BI 3				"=cs, cs,    X,   X"))]
@@ -1427,7 +1427,7 @@
 ; The "s_mulk_i32" variant sets SCC to indicate overflow (which we don't care
 ; about here, but we need to indicate the clobbering).
 (define_insn "mulsi3"
-  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg,   v")
+  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg,  ?v")
         (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA,   v")
 		 (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B,vASv")))
    (clobber (match_scratch:BI 3				 "=X,cs,  X,   X"))]
@@ -1556,9 +1556,9 @@
   })
 
 (define_insn_and_split "muldi3"
-  [(set (match_operand:DI 0 "register_operand"         "=&Sg,&Sg, &v,&v")
-	(mult:DI (match_operand:DI 1 "register_operand" "%Sg, Sg,  v, v")
-		 (match_operand:DI 2 "nonmemory_operand" "Sg,  i,vSv, A")))
+  [(set (match_operand:DI 0 "register_operand"         "=&Sg,&Sg,?&v,?&v")
+	(mult:DI (match_operand:DI 1 "register_operand" "%Sg, Sg,  v,  v")
+		 (match_operand:DI 2 "nonmemory_operand" "Sg,  i,vSv,  A")))
    (clobber (match_scratch:SI 3 "=&Sg,&Sg,&v,&v"))
    (clobber (match_scratch:BI 4  "=cs, cs, X, X"))]
   ""
@@ -1613,7 +1613,7 @@
 (define_code_attr popcount_extra_op [(not "") (popcount ", 0")])
 
 (define_insn "<expander>si2"
-  [(set (match_operand:SI 0 "register_operand"  "=Sg,   v")
+  [(set (match_operand:SI 0 "register_operand"  "=Sg,  ?v")
         (bitunop:SI
 	  (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
    (clobber (match_scratch:BI 2			"=cs,   X"))]
@@ -1691,7 +1691,7 @@
 (define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
 
 (define_insn "<expander>si3"
-  [(set (match_operand:SI 0 "gcn_valu_dst_operand"    "= Sg,   v,RD")
+  [(set (match_operand:SI 0 "gcn_valu_dst_operand"    "= Sg,  ?v,?RD")
         (vec_and_scalar_com:SI
 	  (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
 	  (match_operand:SI 2 "gcn_alu_operand"	      " SgB,   v, v")))
@@ -1705,7 +1705,7 @@
    (set_attr "length" "8")])
 
 (define_insn "<expander>si3"
-  [(set (match_operand:SI 0 "register_operand"  "=Sg, Sg,   v")
+  [(set (match_operand:SI 0 "register_operand"  "=Sg, Sg,  ?v")
         (vec_and_scalar_nocom:SI
 	  (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA,   v")
 	  (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
@@ -1731,7 +1731,7 @@
 ;; {{{ ALU: generic 64-bit
 
 (define_insn_and_split "one_cmpldi2"
-  [(set (match_operand:DI 0 "register_operand"        "=Sg,    v")
+  [(set (match_operand:DI 0 "register_operand"        "=Sg,   ?v")
 	(not:DI (match_operand:DI 1 "gcn_alu_operand" "SgA,vSvDB")))
    (clobber (match_scratch:BI 2			      "=cs,    X"))]
   ""
@@ -1753,7 +1753,7 @@
 (define_code_iterator vec_and_scalar64_com [and ior xor])
 
 (define_insn_and_split "<expander>di3"
-   [(set (match_operand:DI 0 "register_operand"  "= Sg,    v")
+   [(set (match_operand:DI 0 "register_operand"  "= Sg,   ?v")
 	 (vec_and_scalar64_com:DI
 	  (match_operand:DI 1 "gcn_alu_operand"  "%SgA,vSvDB")
 	   (match_operand:DI 2 "gcn_alu_operand" " SgC,    v")))
@@ -1781,7 +1781,7 @@
    (set_attr "length" "8")])
 
 (define_insn "<expander>di3"
-  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,   v")
+  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,  ?v")
 	(vec_and_scalar_nocom:DI
 	  (match_operand:DI 1 "gcn_alu_operand"  "SgC,SgA,   v")
 	  (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgC,vSvC")))
-- 
2.39.5

Reply via email to