https://gcc.gnu.org/g:84e5bd1e7dbdd106956ed0f5d8ddee7bf7b3be7c
commit r17-915-g84e5bd1e7dbdd106956ed0f5d8ddee7bf7b3be7c Author: Roger Sayle <[email protected]> Date: Thu May 28 20:46:04 2026 +0100 x86 SSE: Improve vector increment/decrement on x86. This patch improves the code generated by the i386 backend for incrementing (adding one to) and decrementing (subtracting one from) a vector. With SSE, materializing the vector -1 is more efficient than materializing the vector +1, hence x + 1 (increment) is better expressed as x - (-1), and x - 1 (decrement) is better expressed as x + (-1). Conveniently, the relevant additions and subtractions are specified as a single pattern, using a plusminus iterator, in the machine description. For the four example functions: typedef char v16sqi __attribute__ ((vector_size(16))); typedef unsigned char v16uqi __attribute__ ((vector_size(16))); v16sqi sadd1(v16sqi x) { return x+1; } v16uqi uadd1(v16uqi x) { return x+1; } v16sqi saddm1(v16sqi x) { return x-1; } v16uqi uaddm1(v16uqi x) { return x-1; } GCC with -O2 -mavx2 previously generated: sadd1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpabsb %xmm1, %xmm1 vpaddb %xmm1, %xmm0, %xmm0 ret uadd1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpabsb %xmm1, %xmm1 vpaddb %xmm1, %xmm0, %xmm0 ret saddm1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpabsb %xmm1, %xmm1 vpsubb %xmm1, %xmm0, %xmm0 ret uaddm1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpaddb %xmm1, %xmm0, %xmm0 ret With this patch, we now consistently generate: sadd1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpsubb %xmm1, %xmm0, %xmm0 ret uadd1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpsubb %xmm1, %xmm0, %xmm0 ret saddm1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpaddb %xmm1, %xmm0, %xmm0 ret uaddm1: vpcmpeqd %xmm1, %xmm1, %xmm1 vpaddb %xmm1, %xmm0, %xmm0 ret 2026-05-28 Roger Sayle <[email protected]> Hongtao Liu <[email protected]> Uros Bizjak <[email protected]> gcc/ChangeLog * config/i386/i386.md (inv_insn): New define_code_attr. * config/i386/sse.md (<plusminus><mode>3): Accept a CONST_VECTOR as the second operand. 
If the second operand is CONST1_RTX, canonicalize to use CONSTM1_RTX instead. (*add<mode>3_one): New define_insn_and_split to convert padd +1 to psub -1. (*sub<mode>3_one): Likewise, a new define_insn_and_split to convert psub +1 to padd -1. gcc/testsuite/ChangeLog * gcc.target/i386/avx512f-simd-1.c: Tweak test case. * gcc.target/i386/sse2-paddb-2.c: New test case. * gcc.target/i386/sse2-paddd-2.c: Likewise. * gcc.target/i386/sse2-paddw-2.c: Likewise. * gcc.target/i386/sse2-psubb-2.c: Likewise. * gcc.target/i386/sse2-psubd-2.c: Likewise. * gcc.target/i386/sse2-psubw-2.c: Likewise. Diff: --- gcc/config/i386/i386.md | 3 ++ gcc/config/i386/sse.md | 45 ++++++++++++++++++++++++-- gcc/testsuite/gcc.target/i386/avx512f-simd-1.c | 6 ++-- gcc/testsuite/gcc.target/i386/sse2-paddb-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-paddd-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-paddw-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-psubb-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-psubd-2.c | 20 ++++++++++++ gcc/testsuite/gcc.target/i386/sse2-psubw-2.c | 20 ++++++++++++ 9 files changed, 169 insertions(+), 5 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 20d57c69bd3b..9b64843cec8e 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -1021,6 +1021,9 @@ [(plus "add") (ss_plus "adds") (us_plus "addus") (minus "sub") (ss_minus "subs") (us_minus "subus")]) +;; Inverse instruction base name +(define_code_attr inv_insn [(plus "sub") (minus "add")]) + (define_code_iterator multdiv [mult div]) (define_code_attr multdiv_mnemonic diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index de092f4b9ae1..39d8d196fbe9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -16590,9 +16590,23 @@ [(set (match_operand:VI_AVX2 0 "register_operand") (plusminus:VI_AVX2 (match_operand:VI_AVX2 1 "vector_operand") - (match_operand:VI_AVX2 2 "vector_operand")))] + (match_operand:VI_AVX2 2 
"vector_or_const_vector_operand")))] "TARGET_SSE2" - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") +{ + /* Expand vector add/sub 1 as vector sub/add -1. */ + if (rtx_equal_p (operands[2], CONST1_RTX (<MODE>mode))) + { + operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode)); + emit_insn (gen_<inv_insn><mode>3 (operands[0], operands[1], + operands[2])); + DONE; + } + + if (CONST_VECTOR_P (operands[2])) + operands[2] = force_reg (<MODE>mode, operands[2]); + + ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); +}) (define_expand "cond_<insn><mode>" [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand") @@ -16677,6 +16691,33 @@ (set_attr "prefix" "evex") (set_attr "mode" "<sseinsnmode>")]) +/* Split vector add 1 into vector sub -1. */ +(define_insn_and_split "*add<mode>3_one" + [(set (match_operand:VI_AVX2 0 "register_operand") + (plus:VI_AVX2 + (match_operand:VI_AVX2 1 "nonimmediate_operand") + (match_operand:VI_AVX2 2 "const1_operand")))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) (minus:VI_AVX2 (match_dup 1) (match_dup 3)))] +{ + operands[1] = force_reg (<MODE>mode, operands[1]); + operands[3] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode)); +}) + +/* Split vector sub 1 into vector add -1. 
*/ +(define_insn_and_split "*sub<mode>3_one" + [(set (match_operand:VI_AVX2 0 "register_operand") + (minus:VI_AVX2 + (match_operand:VI_AVX2 1 "nonimmediate_operand") + (match_operand:VI_AVX2 2 "const1_operand")))] + "TARGET_SSE2 && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) (plus:VI_AVX2 (match_dup 3) (match_dup 1)))] + "operands[3] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));") + (define_expand "<insn><mode>3<mask_name>" [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") (sat_plusminus:VI12_AVX2_AVX512BW diff --git a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c index 235fb917e17f..77c5f202e2f5 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c @@ -13,7 +13,7 @@ f1 (void) int i; #pragma omp simd simdlen (4) for (i = 0; i < N; ++i) - a[i] = a[i] + 1; + a[i] = a[i] + 11; } void @@ -22,7 +22,7 @@ f2 (void) int i; #pragma omp simd simdlen (8) for (i = 0; i < N; ++i) - a[i] = a[i] + 2; + a[i] = a[i] + 12; } void @@ -31,5 +31,5 @@ f3 (void) int i; #pragma omp simd simdlen (16) for (i = 0; i < N; ++i) - a[i] = a[i] + 3; + a[i] = a[i] + 13; } diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c new file mode 100644 index 000000000000..f4acff29a206 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef char v16sqi __attribute__ ((vector_size(16))); +typedef unsigned char v16uqi __attribute__ ((vector_size(16))); + +v16sqi si,so; +v16uqi ui,uo; + +void foo() +{ + so = si - 1; +} + +void bar() +{ + uo = ui - 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]paddb\[ \t\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c new file mode 100644 index 000000000000..d48022cbfdae --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef int v4ssi __attribute__ ((vector_size(16))); +typedef unsigned int v4usi __attribute__ ((vector_size(16))); + +v4ssi si,so; +v4usi ui,uo; + +void foo() +{ + so = si - 1; +} + +void bar() +{ + uo = ui - 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]paddd\[ \t\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c new file mode 100644 index 000000000000..be81170cbf7f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef short v8shi __attribute__ ((vector_size(16))); +typedef unsigned short v8uhi __attribute__ ((vector_size(16))); + +v8shi si,so; +v8uhi ui,uo; + +void foo() +{ + so = si - 1; +} + +void bar() +{ + uo = ui - 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]paddw\[ \t\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c new file mode 100644 index 000000000000..e6f421eb276e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef char v16sqi __attribute__ ((vector_size(16))); +typedef unsigned char v16uqi __attribute__ ((vector_size(16))); + +v16sqi si,so; +v16uqi ui,uo; + +void foo() +{ + so = si + 1; +} + +void bar() +{ + uo = ui + 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]psubb\[ \t\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c new file mode 100644 index 000000000000..aaf7e5a5aae6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef int v4ssi __attribute__ ((vector_size(16))); +typedef unsigned int v4usi 
__attribute__ ((vector_size(16))); + +v4ssi si,so; +v4usi ui,uo; + +void foo() +{ + so = si + 1; +} + +void bar() +{ + uo = ui + 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]psubd\[ \t\]" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c new file mode 100644 index 000000000000..8c11012af9a0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ + +typedef short v8shi __attribute__ ((vector_size(16))); +typedef unsigned short v8uhi __attribute__ ((vector_size(16))); + +v8shi si,so; +v8uhi ui,uo; + +void foo() +{ + so = si + 1; +} + +void bar() +{ + uo = ui + 1; +} + +/* { dg-final { scan-assembler-times "\[ \t\]psubw\[ \t\]" 2 } } */
