https://gcc.gnu.org/g:1bf9bb5f1c5bdaf8f96c64fbf2a081aaaa2b9b2f

commit r17-2058-g1bf9bb5f1c5bdaf8f96c64fbf2a081aaaa2b9b2f
Author: Roger Sayle <[email protected]>
Date:   Wed Jul 1 14:26:42 2026 +0100

    simplify-rtx: Simplify vec_merge of a vec_merge with a repeated operand.
    
    This patch adds an RTL optimization to simplify-rtx.cc to simplify a
    vec_merge of a vec_merge.
    
    A motivating example is the following code on x86_64:
    
    typedef unsigned int v4si __attribute__((vector_size(16)));
    
    v4si foo(v4si vec, int val) {
        vec[1] = val;
        vec[3] = val;
        return vec;
    }
    
    with -O2 -mavx2, GCC currently generates the following code:
    
    foo:    vpinsrd $1, %edi, %xmm0, %xmm0
            vpinsrd $3, %edi, %xmm0, %xmm0
            ret
    
    During combine, we see:
    
    Trying 9 -> 12:
        9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
          REG_DEAD r106:V4SI
       12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
          REG_DEAD r103:V4SI
          REG_DEAD r102:SI
    Failed to match this instruction:
    (set (reg:V4SI 105 [ vec_5 ])
        (vec_merge:V4SI (vec_merge:V4SI (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ]))
                (reg:V4SI 106 [ vecD.3391 ])
                (const_int 2 [0x2]))
            (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ]))
            (const_int 7 [0x7])))
    
    This can be simplified/canonicalized, because
    (vec_merge (vec_merge a b m) a n) is equivalent to
    (vec_merge a b (m|~n)).  This is easy to see for two reasons.  First,
    the first two operands of a vec_merge may be swapped by inverting the
    third, i.e. (vec_merge a b n) is equivalent to (vec_merge b a ~n).
    Second, merging one set of elements from a vector, followed by another
    set of elements from the same vector, can be done in a single
    step/instruction, i.e.
    (vec_merge a (vec_merge a b m) n) = (vec_merge a b (m|n)).
    
    With this transformation in simplify-rtx.cc, combine now reports:
    
    Trying 3, 9 -> 12:
        3: r102:SI=r107:SI
          REG_DEAD r107:SI
        9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
          REG_DEAD r106:V4SI
       12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
          REG_DEAD r103:V4SI
          REG_DEAD r102:SI
    Failed to match this instruction:
    (set (reg:V4SI 105 [ vec_5 ])
        (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ]))
            (reg:V4SI 106 [ vecD.3391 ])
            (const_int 10 [0xa])))
    Successfully matched this instruction:
    (set (reg:V4SI 103 [ vec_4 ])
        (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ])))
    Successfully matched this instruction:
    (set (reg:V4SI 105 [ vec_5 ])
        (vec_merge:V4SI (reg:V4SI 103 [ vec_4 ])
            (reg:V4SI 106 [ vecD.3391 ])
            (const_int 10 [0xa])))
    allowing combination of insns 3, 9 and 12
    original costs 4 + 4 + 4 = 12
    replacement costs 4 + 4 = 8
    
    And for the example above, we now generate the faster:
    
    foo:    vmovd   %edi, %xmm2
            vpbroadcastd    %xmm2, %xmm1
            vpblendd        $10, %xmm1, %xmm0, %xmm0
            ret
    
    which uses only 1 inter-unit move.
    
    The effect is even more dramatic when all the elements of a vector
    get set:
    
    v4si bar(v4si vec, int val) {
        vec[0] = val;
        vec[1] = val;
        vec[2] = val;
        vec[3] = val;
        return vec;
    }
    
    Before:
    bar:    vpinsrd $0, %edi, %xmm0, %xmm0
            vpinsrd $1, %edi, %xmm0, %xmm0
            vpinsrd $2, %edi, %xmm0, %xmm0
            vpinsrd $3, %edi, %xmm0, %xmm0
            ret
    
    After:
    bar:    vmovd   %edi, %xmm1
            vpbroadcastd    %xmm1, %xmm0
            ret
    
    2026-07-01  Roger Sayle  <[email protected]>
    
    gcc/ChangeLog
            * simplify-rtx.cc (simplify_context::simplify_ternary_operation)
            <case VEC_MERGE>: Simplify a vec_merge of a vec_merge with a
            repeated operand.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/avx2-vpblendd128-3.c: New test case.

Diff:
---
 gcc/simplify-rtx.cc                                | 31 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c | 22 +++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index e55cf0e0813c..92a2a6e954a8 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7826,6 +7826,21 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
                  if (!(sel & ~sel0 & mask) && !side_effects_p (XEXP (op0, 1)))
                    return simplify_gen_ternary (code, mode, mode,
                                                 XEXP (op0, 0), op1, op2);
+
+                 /* Replace (vec_merge (vec_merge a b m) a n) with
+                    (vec_merge a b (m|~n)).  */
+                 if (rtx_equal_p (XEXP (op0, 0), op1)
+                     && ! side_effects_p (op1))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op1, XEXP (op0, 1),
+                                                GEN_INT ((sel0 | ~sel) & mask));
+                 /* Replace (vec_merge (vec_merge b a m) a n) with
+                    (vec_merge b a (m&n)).  */
+                 if (rtx_equal_p (XEXP (op0, 1), op1)
+                     && ! side_effects_p (op1))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                XEXP (op0, 0), op1,
+                                                GEN_INT (sel & sel0 & mask));
                }
            }
          if (GET_CODE (op1) == VEC_MERGE)
@@ -7840,6 +7855,22 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
                  if (!(~sel & ~sel1 & mask) && !side_effects_p (XEXP (op1, 1)))
                    return simplify_gen_ternary (code, mode, mode,
                                                 op0, XEXP (op1, 0), op2);
+
+                 /* Replace (vec_merge a (vec_merge a b m) n) with
+                    (vec_merge a b (m|n)).  */
+                 if (rtx_equal_p (XEXP (op1, 0), op0)
+                     && ! side_effects_p (op0))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op0, XEXP (op1, 1),
+                                                GEN_INT ((sel | sel1) & mask));
+
+                 /* Replace (vec_merge a (vec_merge b a m) n) with
+                    (vec_merge a b (~m|n)).  */
+                 if (rtx_equal_p (XEXP (op1, 1), op0)
+                     && ! side_effects_p (op0))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op0, XEXP (op1, 0),
+                                                GEN_INT ((sel | ~sel1) & mask));
                }
            }
 
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
new file mode 100644
index 000000000000..a4bd90b48d11
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+v4si foo(v4si vec, int val) {
+    vec[0] = val;
+    vec[2] = val;
+    return vec;
+}
+
+v4si bar(v4si vec, int val) {
+    vec[0] = val;
+    vec[1] = val;
+    vec[2] = val;
+    vec[3] = val;
+    return vec;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+/* { dg-final { scan-assembler-times "vpblendd\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-not "vpinsrd" } } */

Reply via email to