https://gcc.gnu.org/g:6641aaa9ad5d5b3b597935b715b00e526235a852
commit r16-5444-g6641aaa9ad5d5b3b597935b715b00e526235a852 Author: Xi Ruoyao <[email protected]> Date: Sun Nov 16 19:38:02 2025 +0800 LoongArch: Micro-optimize the blend step for vec_perm<LASX> Clamp the selector using twice the actual number of elements (2w) instead of the fixed value 0x1f, so we can simply compare the clamped selector with w to generate the mask for blending. gcc/ * config/loongarch/loongarch.cc (loongarch_expand_vec_perm_1): Clamp the selector using twice the actual number of elements. Compare the clamped selector with the element number to get the blending mask. Diff: --- gcc/config/loongarch/loongarch.cc | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index 9e9d6cfc75fe..4e32b23b6dbf 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -9095,10 +9095,13 @@ loongarch_expand_vec_perm_1 (rtx operands[]) w = GET_MODE_NUNITS (mode); /* If we are using xvshuf.*, clamp the selector to avoid unpredictable - output. */ - if (maskmode != V8SImode && maskmode != V4DImode) + output; if we need to blend two shuf results for the final result, + also clamp it so we can use xvslei to generate the bitmask for + the blending. */ + if ((maskmode != V8SImode && maskmode != V4DImode) + || !one_operand_shuffle) { - rtx t = gen_const_vec_duplicate (maskmode, GEN_INT (0x1f)); + rtx t = gen_const_vec_duplicate (maskmode, GEN_INT (2 * w - 1)); mask = expand_binop (maskmode, and_optab, mask, t, NULL_RTX, false, OPTAB_DIRECT); } @@ -9211,18 +9214,13 @@ merge_two: /* Then merge them together. The key is whether any given control element contained a bit set that indicates the second word. 
*/ rtx xops[6]; - mask = operands[3]; - vt = GEN_INT (w); - vt = gen_const_vec_duplicate (maskmode, vt); - vt = force_reg (maskmode, vt); - mask = expand_simple_binop (maskmode, AND, mask, vt, - NULL_RTX, 0, OPTAB_DIRECT); + vt = gen_const_vec_duplicate (maskmode, GEN_INT (w - 1)); if (GET_MODE (target) != mode) target = gen_reg_rtx (mode); xops[0] = target; - xops[1] = gen_lowpart (mode, t2); - xops[2] = gen_lowpart (mode, t1); - xops[3] = gen_rtx_EQ (maskmode, mask, vt); + xops[1] = gen_lowpart (mode, t1); + xops[2] = gen_lowpart (mode, t2); + xops[3] = gen_rtx_LEU (maskmode, mask, vt); xops[4] = mask; xops[5] = vt;
