stores with tcg nodes.

Nicholas Piggin Thu, 04 Sep 2025 10:21:58 -0700

On Wed, Sep 03, 2025 at 09:52:01PM +0800, Chao Liu wrote:
> From: Chao Liu <chao....@yeah.net>


> +static void gen_ldst_stride_main_loop(DisasContext *s, TCGv dest, uint32_t 
> rs1,
> +                                      uint32_t rs2, uint32_t vm, uint32_t nf,
> +                                      gen_tl_ldst *ld_fn, gen_tl_ldst *st_fn,
> +                                      bool is_load)
> +{
> +    TCGv addr = tcg_temp_new();
> +    TCGv base = get_gpr(s, rs1, EXT_NONE);
> +    TCGv stride = get_gpr(s, rs2, EXT_NONE);
> +
> +    TCGv i = tcg_temp_new();
> +    TCGv i_esz = tcg_temp_new();
> +    TCGv k = tcg_temp_new();
> +    TCGv k_esz = tcg_temp_new();
> +    TCGv k_max = tcg_temp_new();
> +    TCGv mask = tcg_temp_new();
> +    TCGv mask_offs = tcg_temp_new();
> +    TCGv mask_offs_64 = tcg_temp_new();
> +    TCGv mask_elem = tcg_temp_new();
> +    TCGv mask_offs_rem = tcg_temp_new();
> +    TCGv vreg = tcg_temp_new();
> +    TCGv dest_offs = tcg_temp_new();
> +    TCGv stride_offs = tcg_temp_new();
> +
> +    uint32_t max_elems = MAXSZ(s) >> s->sew;
> +
> +    TCGLabel *start = gen_new_label();
> +    TCGLabel *end = gen_new_label();
> +    TCGLabel *start_k = gen_new_label();
> +    TCGLabel *inc_k = gen_new_label();
> +    TCGLabel *end_k = gen_new_label();
> +
> +    MemOp atomicity = MO_ATOM_NONE;
> +    if (s->sew == 0) {
> +        atomicity = MO_ATOM_NONE;
> +    } else {
> +        atomicity = MO_ATOM_IFALIGN_PAIR;
> +    }
> +
> +    tcg_gen_addi_tl(mask, (TCGv)tcg_env, vreg_ofs(s, 0));
> +
> +    /* Start of outer loop. */
> +    tcg_gen_mov_tl(i, cpu_vstart);
> +    gen_set_label(start);
> +    tcg_gen_brcond_tl(TCG_COND_GE, i, cpu_vl, end);
> +    tcg_gen_shli_tl(i_esz, i, s->sew);
> +    /* Start of inner loop. */
> +    tcg_gen_movi_tl(k, 0);
> +    gen_set_label(start_k);
> +    tcg_gen_brcond_tl(TCG_COND_GE, k, tcg_constant_tl(nf), end_k);
> +    /*
> +     * If we are in mask agnostic regime and the operation is not unmasked we
> +     * set the inactive elements to 1.
> +     */
> +    if (!vm && s->vma) {
> +        TCGLabel *active_element = gen_new_label();
> +        /* (i + k * max_elems) * esz */
> +        tcg_gen_shli_tl(mask_offs, k, get_log2(max_elems << s->sew));
> +        tcg_gen_add_tl(mask_offs, mask_offs, i_esz);
> +
> +        /*
> +         * Check whether the i bit of the mask is 0 or 1.
> +         *
> +         * static inline int vext_elem_mask(void *v0, int index)
> +         * {
> +         *     int idx = index / 64;
> +         *     int pos = index  % 64;
> +         *     return (((uint64_t *)v0)[idx] >> pos) & 1;
> +         * }
> +         */
> +        tcg_gen_shri_tl(mask_offs_64, mask_offs, 3);
> +        tcg_gen_add_tl(mask_offs_64, mask_offs_64, mask);
> +        tcg_gen_ld_i64((TCGv_i64)mask_elem, (TCGv_ptr)mask_offs_64, 0);
> +        tcg_gen_rem_tl(mask_offs_rem, mask_offs, tcg_constant_tl(8));

Could this be faster with tcg_gen_andi_tl(mask_offs_rem, mask_offs, 7)?

> +        tcg_gen_shr_tl(mask_elem, mask_elem, mask_offs_rem);
> +        tcg_gen_andi_tl(mask_elem, mask_elem, 1);
> +        tcg_gen_brcond_tl(TCG_COND_NE, mask_elem, tcg_constant_tl(0),
> +                          active_element);
> +        /*
> +         * Set masked-off elements in the destination vector register to 1s.
> +         * Store instructions simply skip this bit as memory ops access 
> memory
> +         * only for active elements.
> +         */
> +        if (is_load) {
> +            tcg_gen_shli_tl(mask_offs, mask_offs, s->sew);
> +            tcg_gen_add_tl(mask_offs, mask_offs, dest);
> +            st_fn(tcg_constant_tl(-1), (TCGv_ptr)mask_offs, 0);

One more small thing. This is implementing load and store instructions,
yet there are these ld_fn and st_fn, which makes it a bit confusing.
Calling them something like read_vreg / write_vreg might help make them
a bit clearer.

I also suggest try to split the at generation of access for the vector
data and mask registers to see if you can split them into helper
functions. That could make this function smaller and a bit nicer to
review.

Thanks,
Nick

Re: [PATCH v6 1/2] target/riscv: Generate strided vector loads/stores with tcg nodes.

Reply via email to