https://gcc.gnu.org/g:1bf9bb5f1c5bdaf8f96c64fbf2a081aaaa2b9b2f
commit r17-2058-g1bf9bb5f1c5bdaf8f96c64fbf2a081aaaa2b9b2f Author: Roger Sayle <[email protected]> Date: Wed Jul 1 14:26:42 2026 +0100 simplify-rtx: Simplify vec_merge of a vec_merge with a repeated operand. This patch adds an RTL optimization to simplify-rtx.cc to simplify a vec_merge of a vec_merge. A motivating example is the following code on x86_64: typedef unsigned int v4si __attribute__((vector_size(16))); v4si foo(v4si vec, int val) { vec[1] = val; vec[3] = val; return vec; } With -O2 -mavx2, GCC currently generates the following code: foo: vpinsrd $1, %edi, %xmm0, %xmm0 vpinsrd $3, %edi, %xmm0, %xmm0 ret During combine, we see: Trying 9 -> 12: 9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2) REG_DEAD r106:V4SI 12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8) REG_DEAD r103:V4SI REG_DEAD r102:SI Failed to match this instruction: (set (reg:V4SI 105 [ vec_5 ]) (vec_merge:V4SI (vec_merge:V4SI (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ])) (reg:V4SI 106 [ vecD.3391 ]) (const_int 2 [0x2])) (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ])) (const_int 7 [0x7]))) This can be simplified/canonicalized, as (vec_merge (vec_merge a b m) a n) is equivalent to (vec_merge a b (m|~n)). This is easy to see as the first two operands of a vec_merge may be swapped by inverting the third, i.e. (vec_merge a b n) is equivalent to (vec_merge b a ~n), and merging one set of elements from a vector, followed by another set of elements from the same vector, can be done in a single step/instruction, i.e. (vec_merge a (vec_merge a b m) n) = (vec_merge a b (m|n)). 
With this transformation in simplify-rtx.cc, combine now reports: Trying 3, 9 -> 12: 3: r102:SI=r107:SI REG_DEAD r107:SI 9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2) REG_DEAD r106:V4SI 12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8) REG_DEAD r103:V4SI REG_DEAD r102:SI Failed to match this instruction: (set (reg:V4SI 105 [ vec_5 ]) (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ])) (reg:V4SI 106 [ vecD.3391 ]) (const_int 10 [0xa]))) Successfully matched this instruction: (set (reg:V4SI 103 [ vec_4 ]) (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ]))) Successfully matched this instruction: (set (reg:V4SI 105 [ vec_5 ]) (vec_merge:V4SI (reg:V4SI 103 [ vec_4 ]) (reg:V4SI 106 [ vecD.3391 ]) (const_int 10 [0xa]))) allowing combination of insns 3, 9 and 12 original costs 4 + 4 + 4 = 12 replacement costs 4 + 4 = 8 And for the example above, we now generate the faster: foo: vmovd %edi, %xmm2 vpbroadcastd %xmm2, %xmm1 vpblendd $10, %xmm1, %xmm0, %xmm0 ret which uses only 1 inter-unit move. The effect is even more dramatic when all the elements of a vector get set: v4si bar(v4si vec, int val) { vec[0] = val; vec[1] = val; vec[2] = val; vec[3] = val; return vec; } Before: bar: vpinsrd $0, %edi, %xmm0, %xmm0 vpinsrd $1, %edi, %xmm0, %xmm0 vpinsrd $2, %edi, %xmm0, %xmm0 vpinsrd $3, %edi, %xmm0, %xmm0 ret After: bar: vmovd %edi, %xmm1 vpbroadcastd %xmm1, %xmm0 ret 2026-07-01 Roger Sayle <[email protected]> gcc/ChangeLog * simplify-rtx.cc (simplify_context::simplify_ternary_operation) <case VEC_MERGE>: Simplify a vec_merge of a vec_merge with a repeated operand. gcc/testsuite/ChangeLog * gcc.target/i386/avx2-vpblendd128-3.c: New test case. 
Diff: --- gcc/simplify-rtx.cc | 31 ++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c | 22 +++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index e55cf0e0813c..92a2a6e954a8 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -7826,6 +7826,21 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode, if (!(sel & ~sel0 & mask) && !side_effects_p (XEXP (op0, 1))) return simplify_gen_ternary (code, mode, mode, XEXP (op0, 0), op1, op2); + + /* Replace (vec_merge (vec_merge a b m) a n) with + (vec_merge a b (m|~n)). */ + if (rtx_equal_p (XEXP (op0, 0), op1) + && ! side_effects_p (op1)) + return simplify_gen_ternary (code, mode, mode, + op1, XEXP (op0, 1), + GEN_INT ((sel0 | ~sel) & mask)); + /* Replace (vec_merge (vec_merge b a m) a n) with + (vec_merge b a (m&n)). */ + if (rtx_equal_p (XEXP (op0, 1), op1) + && ! side_effects_p (op1)) + return simplify_gen_ternary (code, mode, mode, + XEXP (op0, 0), op1, + GEN_INT (sel & sel0 & mask)); } } if (GET_CODE (op1) == VEC_MERGE) @@ -7840,6 +7855,22 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode, if (!(~sel & ~sel1 & mask) && !side_effects_p (XEXP (op1, 1))) return simplify_gen_ternary (code, mode, mode, op0, XEXP (op1, 0), op2); + + /* Replace (vec_merge a (vec_merge a b m) n) with + (vec_merge a b (m|n)). */ + if (rtx_equal_p (XEXP (op1, 0), op0) + && ! side_effects_p (op0)) + return simplify_gen_ternary (code, mode, mode, + op0, XEXP (op1, 1), + GEN_INT ((sel | sel1) & mask)); + + /* Replace (vec_merge a (vec_merge b a m) n) with + (vec_merge a b (~m|n)). */ + if (rtx_equal_p (XEXP (op1, 1), op0) + && ! 
side_effects_p (op0)) + return simplify_gen_ternary (code, mode, mode, + op0, XEXP (op1, 0), + GEN_INT ((sel | ~sel1) & mask)); } } diff --git a/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c new file mode 100644 index 000000000000..a4bd90b48d11 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx2 -O2" } */ + +typedef unsigned int v4si __attribute__((vector_size(16))); + +v4si foo(v4si vec, int val) { + vec[0] = val; + vec[2] = val; + return vec; +} + +v4si bar(v4si vec, int val) { + vec[0] = val; + vec[1] = val; + vec[2] = val; + vec[3] = val; + return vec; +} + +/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */ +/* { dg-final { scan-assembler-times "vpblendd\[ \\t\]+" 1 } } */ +/* { dg-final { scan-assembler-not "vpinsrd" } } */
