https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
GCC trunk for RVV also processes 11 elements per vector:

https://godbolt.org/z/q9bb8Gj4G

```
vsetivli        zero,11,e32,m1,ta,ma

```
vector codes

```
        lh      s8,0(t4)
        lh      t4,0(t1)
        ld      t1,16(sp)
        add     a3,a4,t1
        lh      a7,0(a7)
        add     t3,a4,s9
        slli    t3,t3,1
        slli    a3,a3,1
        add     t3,a2,t3
        add     a3,a2,a3
        add     a1,a5,t5
        mulw    t0,s6,s8
        lh      t5,0(t3)
        lh      t3,0(a3)
        add     a3,a5,t1
        add     a0,a5,s9
        slli    a0,a0,1
        add     a0,a2,a0
        lh      a0,0(a0)
        slli    a1,a1,1
        add     a1,a2,a1
        mulw    t1,s5,a7
        lh      a1,0(a1)
        slli    a3,a3,1
        add     a3,a2,a3
        lh      a3,0(a3)
        add     a4,s7,a4
        slli    a4,a4,1
        add     a4,t6,a4
        add     a5,s7,a5
        slli    a5,a5,1
        mulw    s8,s5,s8
        subw    t0,t0,t1
        add     a5,t6,a5
        addiw   a6,a6,24
        mulw    a7,s6,a7
        mulw    t1,s4,t5
        addw    a7,a7,s8
        mulw    t5,s3,t5
        mulw    s8,s3,a0
        mulw    a0,s4,a0
        subw    t1,t1,s8
        addw    t1,t0,t1
        addw    a0,a0,t5
        addw    a0,a7,a0
        mulw    t0,s2,t4
        mulw    a7,s1,a1
        mulw    t4,s1,t4
        subw    t0,t0,a7
        addw    t0,t0,t1
        mulw    a1,s2,a1
        addw    a1,a1,t4
        mulw    a7,s0,t3
        addw    a1,a1,a0
        mulw    a0,t2,a3
        subw    a7,a7,a0
        addw    a7,a7,t0
        sraiw   a7,a7,15
        sh      a7,0(a4)
        mulw    t3,t2,t3
        mulw    a4,s0,a3
        addw    a4,a4,t3
        addw    a4,a4,a1
        sraiw   a4,a4,15
        sh      a4,0(a5)
        bne     a6,s11,.L4
        ld      a0,56(sp)
        addiw   a5,a0,1
        slli    a0,a5,48
        ld      t4,72(sp)
        ld      t1,64(sp)
        srli    a0,a0,48
        ld      a5,80(sp)
        ld      a7,48(sp)
        addw    t1,t4,t1
        addw    s11,t4,a6
        bne     a0,a5,.L6
```

This is the same issue as on ARM SVE; I think the scalar tail operations can be
folded into the vector operations.

Reply via email to