This patch depends on middle-end support:
https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627305.html
This patch allows us to auto-vectorize the following case:
#define TEST_LOOP(NAME, OUTTYPE, INTYPE, MASKTYPE)                      \
  void __attribute__ ((noinline, noclone))                              \
  NAME##_8 (OUTTYPE *__restrict dest, INTYPE *__restrict src,           \
            MASKTYPE *__restrict cond, intptr_t n)                      \
  {                                                                     \
    for (intptr_t i = 0; i < n; ++i)                                    \
      if (cond[i])                                                      \
        dest[i] = (src[i * 8] + src[i * 8 + 1] + src[i * 8 + 2]         \
                   + src[i * 8 + 3] + src[i * 8 + 4] + src[i * 8 + 5]   \
                   + src[i * 8 + 6] + src[i * 8 + 7]);                  \
  }

#define TEST2(NAME, OUTTYPE, INTYPE) \
  TEST_LOOP (NAME##_f32, OUTTYPE, INTYPE, int32_t)

#define TEST1(NAME, OUTTYPE) \
  TEST2 (NAME##_i32, OUTTYPE, int32_t)

#define TEST(NAME) \
  TEST1 (NAME##_i32, int32_t)

TEST (test)

ASM:
test_i32_i32_f32_8:
ble a3,zero,.L5
.L3:
vsetvli a4,a3,e8,mf4,ta,ma
vle32.v v0,0(a2)
vsetvli a5,zero,e32,m1,ta,ma
vmsne.vi v0,v0,0
vsetvli zero,a4,e32,m1,ta,ma
vlseg8e32.v v8,(a1),v0.t
vsetvli a5,zero,e32,m1,ta,ma
slli a6,a4,2
vadd.vv v1,v9,v8
slli a7,a4,5
vadd.vv v1,v1,v10
sub a3,a3,a4
vadd.vv v1,v1,v11
vadd.vv v1,v1,v12
vadd.vv v1,v1,v13
vadd.vv v1,v1,v14
vadd.vv v1,v1,v15
vsetvli zero,a4,e32,m1,ta,ma
vse32.v v1,0(a0),v0.t
add a2,a2,a6
add a1,a1,a7
add a0,a0,a6
bne a3,zero,.L3
.L5:
ret
gcc/ChangeLog:
* config/riscv/autovec.md (vec_mask_len_load_lanes<mode><vsingle>):
New pattern.
(vec_mask_len_store_lanes<mode><vsingle>): Ditto.
(<optab><v_quad_trunc><mode>2): Fix pattern for ICE.
(<optab><v_oct_trunc><mode>2): Ditto.
* config/riscv/riscv-protos.h (expand_lanes_load_store): New function.
* config/riscv/riscv-v.cc (get_mask_mode): Add tuple mode mask mode.
(expand_lanes_load_store): New function.
* config/riscv/vector-iterators.md: New iterator.