On Tue, Sep 29, 2020 at 12:55 PM 夏 晋 via Gcc <gcc@gcc.gnu.org> wrote:
>
> Hi everyone,
> I tried to set "vlen" after the add and multiply, as shown in the following
> code:
> ➜
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
>     vf32 add = x3 + x4;
>     vf32 mul = x3 * x4;
>     __builtin_riscv_vlen(vlen);  //<----
>     storevf(&output[0], add);
>     storevf(&output[4], mul);
> }
> But after compilation, the "vlen" write is reordered ahead of the add and multiply:
> ➜
> foo1:
>     lui     a5,%hi(.LANCHOR0)
>     addi    a5,a5,%lo(.LANCHOR0)
>     addi    a4,a5,64
>     vfld    v0,a5
>     vfld    v1,a4
>     csrw    vlen,a2  //<----
>     vfadd   v2,v0,v1
>     addi    a5,a1,8
>     vfmul   v0,v0,v1
>     vfst    v2,a1
>     vfst    v0,a5
>     ret
> And I've tried to add some barrier code shown as the following:
> ➜
> #define barrier() __asm__ __volatile__("": : :"memory")
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
>     vf32 add = x3 + x4;
>     vf32 mul = x3 * x4;
>     barrier();
>     __builtin_riscv_vlen(vlen);
>     barrier();
>     storevf(&output[0], add);
>     storevf(&output[4], mul);
> }
> ➜
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
>     vf32 add = x3 + x4;
>     vf32 mul = x3 * x4;
>     __asm__ __volatile__ ("csrw\tvlen,%0" : : "rJ"(vlen) : "memory");
>     storevf(&output[0], add);
>     storevf(&output[4], mul);
> }
> Both methods compiled to the same incorrect assembly.
> =======
> But if I try code like the following (where the add and multiply use different operands):
> ➜
> vf32 x1,x2;
> vf32 x3,x4;
> void foo1(float16_t* input, float16_t* output, int vlen){
>     vf32 add = x3 + x4;
>     vf32 mul = x1 * x2;
>     __builtin_riscv_vlen(vlen);
>     storevf(&output[0], add);
>     storevf(&output[4], mul);
> }
> the assembly is correct:
> ➜
> foo1:
>     lui     a5,%hi(.LANCHOR0)
>     addi    a5,a5,%lo(.LANCHOR0)
>     addi    a0,a5,64
>     addi    a3,a5,128
>     addi    a4,a5,192
>     vfld    v1,a5
>     vfld    v3,a0
>     vfld    v0,a3
>     vfld    v2,a4
>     vfadd   v1,v1,v3
>     vfmul   v0,v0,v2
>     csrw    vlen,a2  <----
>     addi    a5,a1,8
>     vfst    v1,a1
>     vfst    v0,a5
>     ret
>
> Is there another way to write the code, or a GCC compilation option, that
> deals with this issue?
> Any suggestion would be appreciated. Thank you very much!

You need to present GCC with a data dependence that prevents the re-ordering,
for example by adding add/mul as inputs and outputs of the asm, like this:

asm volatile ("csrw\tvlen, %4" : "=r" (add), "=r" (mul) : "0" (add),
"1" (mul), "rJ" (vlen));

Richard.

> Best,
> Jin

Reply via email to