This patch improves the code generated by the i386 backend for incrementing
(adding one to) and decrementing (subtracting one from) an integer vector.
With SSE, materializing the vector -1 (all bits set, a single pcmpeqd) is
more efficient than materializing the vector +1 (which needs an additional
instruction, such as the vpabsb shown below), hence x + 1 (increment) is
better expressed as x - (-1), and x - 1 (decrement) is better expressed as
x + (-1).  Conveniently, the relevant additions and subtractions are
specified as a single pattern, using a plusminus iterator, in the machine
description.

For the four example functions:

typedef char v16sqi __attribute__ ((vector_size(16)));
typedef unsigned char v16uqi __attribute__ ((vector_size(16)));

v16sqi sadd1(v16sqi x) { return x+1; }
v16uqi uadd1(v16uqi x) { return x+1; }
v16sqi saddm1(v16sqi x) { return x-1; }
v16uqi uaddm1(v16uqi x) { return x-1; }

GCC with -O2 -mavx2 previously generated:

sadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpabsb  %xmm1, %xmm1
        vpaddb  %xmm1, %xmm0, %xmm0
        ret

uadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpabsb  %xmm1, %xmm1
        vpaddb  %xmm1, %xmm0, %xmm0
        ret

saddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpabsb  %xmm1, %xmm1
        vpsubb  %xmm1, %xmm0, %xmm0
        ret

uaddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpaddb  %xmm1, %xmm0, %xmm0
        ret

With this patch, we now consistently generate:

sadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpsubb  %xmm1, %xmm0, %xmm0
        ret

uadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpsubb  %xmm1, %xmm0, %xmm0
        ret

saddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpaddb  %xmm1, %xmm0, %xmm0
        ret

uaddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpaddb  %xmm1, %xmm0, %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures.  OK for mainline?


2026-05-14  Roger Sayle  <[email protected]>

gcc/ChangeLog
        * config/i386/sse.md (<plusminus><mode>3): Accept a CONST_VECTOR
        as the second operand.  If the second operand is CONST1_RTX,
        canonicalize to use CONSTM1_RTX instead.

gcc/testsuite/ChangeLog
        * gcc.target/i386/avx512f-simd-1.c: Tweak test case.
        * gcc.target/i386/sse2-paddb-2.c: New test case.
        * gcc.target/i386/sse2-paddd-2.c: Likewise.
        * gcc.target/i386/sse2-paddw-2.c: Likewise.
        * gcc.target/i386/sse2-psubb-2.c: Likewise.
        * gcc.target/i386/sse2-psubd-2.c: Likewise.
        * gcc.target/i386/sse2-psubw-2.c: Likewise.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51d1e9b455a..86f153a9539 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16590,9 +16590,25 @@
   [(set (match_operand:VI_AVX2 0 "register_operand")
        (plusminus:VI_AVX2
          (match_operand:VI_AVX2 1 "vector_operand")
-         (match_operand:VI_AVX2 2 "vector_operand")))]
+         (match_operand:VI_AVX2 2 "vector_or_const_vector_operand")))]
   "TARGET_SSE2"
-  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+  /* Expand vector add/sub 1 as vector sub/add -1.  */
+  if (rtx_equal_p (operands[2], CONST1_RTX (<MODE>mode)))
+    {
+      rtx insn;
+      operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+      if (<CODE> == PLUS)
+       insn = gen_sub<mode>3 (operands[0], operands[1], operands[2]);
+      else
+       insn = gen_add<mode>3 (operands[0], operands[1], operands[2]);
+      emit_insn (insn);
+      DONE;
+    }
+  if (CONST_VECTOR_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+})
 
 (define_expand "cond_<insn><mode>"
   [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
index 235fb917e17..77c5f202e2f 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
@@ -13,7 +13,7 @@ f1 (void)
   int i;
   #pragma omp simd simdlen (4)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 1;
+    a[i] = a[i] + 11;
 }
 
 void
@@ -22,7 +22,7 @@ f2 (void)
   int i;
   #pragma omp simd simdlen (8)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 2;
+    a[i] = a[i] + 12;
 }
 
 void
@@ -31,5 +31,5 @@ f3 (void)
   int i;
   #pragma omp simd simdlen (16)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 3;
+    a[i] = a[i] + 13;
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c
new file mode 100644
index 00000000000..f4acff29a20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v16sqi __attribute__ ((vector_size(16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size(16)));
+
+v16sqi si,so;
+v16uqi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddb\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c
new file mode 100644
index 00000000000..c7d31299c8b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef int v4ssi __attribute__ ((vector_size(16)));
+typedef unsigned int v4usi __attribute__ ((vector_size(16)));
+
+v4ssi si,so;
+v4usi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddd\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c
new file mode 100644
index 00000000000..be81170cbf7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef short v8shi __attribute__ ((vector_size(16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size(16)));
+
+v8shi si,so;
+v8uhi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddw\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c
new file mode 100644
index 00000000000..e6f421eb276
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v16sqi __attribute__ ((vector_size(16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size(16)));
+
+v16sqi si,so;
+v16uqi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubb\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c
new file mode 100644
index 00000000000..9d85f673bd0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef int v4ssi __attribute__ ((vector_size(16)));
+typedef unsigned int v4usi __attribute__ ((vector_size(16)));
+
+v4ssi si,so;
+v4usi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubd\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c
new file mode 100644
index 00000000000..8c11012af9a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef short v8shi __attribute__ ((vector_size(16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size(16)));
+
+v8shi si,so;
+v8uhi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubw\[ \t\]" 2 } } */

Reply via email to