[PATCH] LoongArch: Fix wrong code generated by TARGET_VECTORIZE_VEC_PERM_CONST [PR121064]

Xi Ruoyao Mon, 14 Jul 2025 21:00:48 -0700

When TARGET_VECTORIZE_VEC_PERM_CONST is called, target may be the
same pseudo as op0 and/or op1.  Loading the selector into target
would clobber the input, producing wrong code like


    vld     $vr0, $t0
    vshuf.w $vr0, $vr0, $vr1

So don't load the selector into d->target; use a new pseudo to hold the
selector instead.  The reload pass will load the pseudo for the selector
and the pseudo for the target into the same hard register (following our
'0' constraint on the shuf instructions) anyway.

gcc/ChangeLog:

        PR target/121064
        * config/loongarch/lsx.md (lsx_vshuf_<lsxfmt_f>): Add '@' to
        generate a mode-aware helper.  Use <VIMODE> as the mode of the
        operand 1 (selector).
        * config/loongarch/lasx.md (lasx_xvshuf_<lasxfmt_f>): Likewise.
        * config/loongarch/loongarch.cc
        (loongarch_try_expand_lsx_vshuf_const): Create a new pseudo for
        the selector.  Use the mode-aware helper to simplify the code.
        (loongarch_expand_vec_perm_const): Likewise.

gcc/testsuite/ChangeLog:

        PR target/121064
        * gcc.target/loongarch/pr121064.c: New test.
---

Bootstrapped and regtested on loongarch64-linux-gnu.  Ok for trunk and
14/15?

 gcc/config/loongarch/lasx.md                  |   4 +-
 gcc/config/loongarch/loongarch.cc             | 126 +++++-------------
 gcc/config/loongarch/lsx.md                   |   4 +-
 gcc/testsuite/gcc.target/loongarch/pr121064.c |  38 ++++++
 4 files changed, 73 insertions(+), 99 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/pr121064.c

diff --git a/gcc/config/loongarch/lasx.md b/gcc/config/loongarch/lasx.md
index 43e3ab0026a..3d71f30a54b 100644
--- a/gcc/config/loongarch/lasx.md
+++ b/gcc/config/loongarch/lasx.md
@@ -2060,9 +2060,9 @@ (define_insn "lasx_xvssub_u_<lasxfmt_u>"
   [(set_attr "type" "simd_int_arith")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "lasx_xvshuf_<lasxfmt_f>"
+(define_insn "@lasx_xvshuf_<lasxfmt_f>"
   [(set (match_operand:LASX_DWH 0 "register_operand" "=f")
-       (unspec:LASX_DWH [(match_operand:LASX_DWH 1 "register_operand" "0")
+       (unspec:LASX_DWH [(match_operand:<VIMODE> 1 "register_operand" "0")
                          (match_operand:LASX_DWH 2 "register_operand" "f")
                          (match_operand:LASX_DWH 3 "register_operand" "f")]
                        UNSPEC_LASX_XVSHUF))]
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 0129108d0d3..036e6859d31 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8388,7 +8388,7 @@ static bool
 loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
 {
   int i;
-  rtx target, op0, op1, sel, tmp;
+  rtx target, op0, op1;
   rtx rperm[MAX_VECT_LEN];
 
   if (GET_MODE_SIZE (d->vmode) == 16)
@@ -8407,47 +8407,23 @@ loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
       for (i = 0; i < d->nelt; i += 1)
          rperm[i] = GEN_INT (d->perm[i]);
 
-      if (d->vmode == E_V2DFmode)
-       {
-         sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, rperm));
-         tmp = simplify_gen_subreg (E_V2DImode, d->target, d->vmode, 0);
-         emit_move_insn (tmp, sel);
-       }
-      else if (d->vmode == E_V4SFmode)
-       {
-         sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, rperm));
-         tmp = simplify_gen_subreg (E_V4SImode, d->target, d->vmode, 0);
-         emit_move_insn (tmp, sel);
-       }
+      machine_mode sel_mode = related_int_vector_mode (d->vmode)
+       .require ();
+      rtvec sel_v = gen_rtvec_v (d->nelt, rperm);
+
+      /* Although vshuf.* (except vshuf.b) needs sel == target, we cannot
+        load sel into target right now: here we are dealing with
+        pseudo regs, and target may be the same pseudo as one of op0
+        or op1.  Then we'd clobber the input.  Instead, we use a new
+        pseudo reg here.  The reload pass will look at the constraint
+        of vshuf.* and move sel into target first if needed.  */
+      rtx sel = force_reg (sel_mode,
+                          gen_rtx_CONST_VECTOR (sel_mode, sel_v));
+
+      if (d->vmode == E_V16QImode)
+       emit_insn (gen_lsx_vshuf_b (target, op1, op0, sel));
       else
-       {
-         sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
-         emit_move_insn (d->target, sel);
-       }
-
-      switch (d->vmode)
-       {
-       case E_V2DFmode:
-         emit_insn (gen_lsx_vshuf_d_f (target, target, op1, op0));
-         break;
-       case E_V2DImode:
-         emit_insn (gen_lsx_vshuf_d (target, target, op1, op0));
-         break;
-       case E_V4SFmode:
-         emit_insn (gen_lsx_vshuf_w_f (target, target, op1, op0));
-         break;
-       case E_V4SImode:
-         emit_insn (gen_lsx_vshuf_w (target, target, op1, op0));
-         break;
-       case E_V8HImode:
-         emit_insn (gen_lsx_vshuf_h (target, target, op1, op0));
-         break;
-       case E_V16QImode:
-         emit_insn (gen_lsx_vshuf_b (target, op1, op0, target));
-         break;
-       default:
-         break;
-       }
+       emit_insn (gen_lsx_vshuf (d->vmode, target, sel, op1, op0));
 
       return true;
     }
@@ -9443,7 +9419,7 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
   bool flag = false;
   unsigned int i;
   unsigned char idx;
-  rtx target, op0, op1, sel, tmp;
+  rtx target, op0, op1;
   rtx rperm[MAX_VECT_LEN];
   unsigned int remapped[MAX_VECT_LEN];
   unsigned char perm2[MAX_VECT_LEN];
@@ -9623,63 +9599,23 @@ loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
 expand_perm_const_end:
       if (flag)
        {
-         /* Copy selector vector from memory to vector register for later insn
-            gen function.
-            If vector's element in floating point value, we cannot fit
-            selector argument into insn gen function directly, because of the
-            insn template definition.  As a solution, generate a integral mode
-            subreg of target, then copy selector vector (that is in integral
-            mode) to this subreg.  */
-         switch (d->vmode)
-           {
-           case E_V4DFmode:
-             sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt,
-                                                                  rperm));
-             tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
-             emit_move_insn (tmp, sel);
-             break;
-           case E_V8SFmode:
-             sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt,
-                                                                  rperm));
-             tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0);
-             emit_move_insn (tmp, sel);
-             break;
-           default:
-             sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt,
-                                                                rperm));
-             emit_move_insn (d->target, sel);
-             break;
-           }
-
          target = d->target;
          op0 = d->op0;
          op1 = d->one_vector_p ? d->op0 : d->op1;
 
-         /* We FINALLY can generate xvshuf.* insn.  */
-         switch (d->vmode)
-           {
-           case E_V4DFmode:
-             emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0));
-             break;
-           case E_V4DImode:
-             emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0));
-             break;
-           case E_V8SFmode:
-             emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0));
-             break;
-           case E_V8SImode:
-             emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0));
-             break;
-           case E_V16HImode:
-             emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0));
-             break;
-           case E_V32QImode:
-             emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target));
-             break;
-           default:
-             gcc_unreachable ();
-             break;
-           }
+         machine_mode sel_mode = related_int_vector_mode (d->vmode)
+           .require ();
+         rtvec sel_v = gen_rtvec_v (d->nelt, rperm);
+
+         /* See the comment in loongarch_try_expand_lsx_vshuf_const for why
+            we don't simply use a SUBREG to pun target.  */
+         rtx sel = force_reg (sel_mode,
+                              gen_rtx_CONST_VECTOR (sel_mode, sel_v));
+
+         if (d->vmode == E_V32QImode)
+           emit_insn (gen_lasx_xvshuf_b (target, op1, op0, sel));
+         else
+           emit_insn (gen_lasx_xvshuf (d->vmode, target, sel, op1, op0));
 
          return true;
        }
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 407c86870df..fb0236ba0f1 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -535,9 +535,9 @@ (define_expand "vec_perm<mode>"
   DONE;
 })
 
-(define_insn "lsx_vshuf_<lsxfmt_f>"
+(define_insn "@lsx_vshuf_<lsxfmt_f>"
   [(set (match_operand:LSX_DWH 0 "register_operand" "=f")
-       (unspec:LSX_DWH [(match_operand:LSX_DWH 1 "register_operand" "0")
+       (unspec:LSX_DWH [(match_operand:<VIMODE> 1 "register_operand" "0")
                         (match_operand:LSX_DWH 2 "register_operand" "f")
                         (match_operand:LSX_DWH 3 "register_operand" "f")]
                        UNSPEC_LSX_VSHUF))]
diff --git a/gcc/testsuite/gcc.target/loongarch/pr121064.c b/gcc/testsuite/gcc.target/loongarch/pr121064.c
new file mode 100644
index 00000000000..a466c7abc70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/pr121064.c
@@ -0,0 +1,38 @@
+/* { dg-require-effective-target loongarch_sx_hw } */
+/* { dg-do run } */
+/* { dg-options "-march=loongarch64 -mfpu=64 -mlsx -O3" } */
+
+typedef __INT32_TYPE__ int32_t;
+typedef unsigned __INT32_TYPE__ uint32_t;
+
+__attribute__ ((noipa)) static int32_t
+long_filter_ehigh_3830_1 (int32_t *buffer, int length)
+{
+  int i, j;
+  int32_t dotprod = 0;
+  int32_t delay[4] = { 0 };
+  uint32_t coeffs[4] = { 0 };
+
+  for (i = 0; i < length; i++)
+    {
+      dotprod = 0;
+      for (j = 3; j >= 0; j--)
+        {
+          dotprod += delay[j] * coeffs[j];
+          coeffs[j] += ((delay[j] >> 31) | 1);
+        }
+      for (j = 3; j > 0; j--)
+        delay[j] = delay[j - 1];
+      delay[0] = buffer[i];
+    }
+
+  return dotprod;
+}
+
+int
+main ()
+{
+  int32_t buffer[] = { -1, 1 };
+  if (long_filter_ehigh_3830_1 (buffer, 2) != -1)
+    __builtin_trap ();
+}
-- 
2.50.0

[PATCH] LoongArch: Fix wrong code generated by TARGET_VECTORIZE_VEC_PERM_CONST [PR121064]

Reply via email to