Hi,

Gentle ping on this patch.

Thanks,
Jim Shu


On Thu, May 8, 2025 at 5:48 PM Jim Shu <jim....@sifive.com> wrote:
>
> Support 4-byte atomic instruction fetch when instruction is natural
> aligned.
>
> Current implementation is not atomic because it loads instruction twice
> for first and last 2 bytes. We load 4 bytes at once to keep the
> atomicity. This instruction preload method only applys when instruction
> is 4-byte aligned. If instruction is unaligned, it could be across pages
> so that preload will trigger additional page fault.
>
> We encounter this issue when doing pressure test of enabling & disabling
> Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> modification and execution of instruction, so non-atomic instruction
> fetch will cause the race condition. We may fetch the wrong instruction
> which is the mixing of 2 instructions.
>
> Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> Ziccif protects the atomicity of instruction fetch when it is
> natural aligned.
>
> This commit depends on the atomic read support of translator_ld in
> the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6.
>
> Signed-off-by: Jim Shu <jim....@sifive.com>
> Reviewed-by: Frank Chang <frank.ch...@sifive.com>
> ---
>  target/riscv/translate.c | 46 +++++++++++++++++++++++++++++-----------
>  1 file changed, 34 insertions(+), 12 deletions(-)
>
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 85128f997b..77edf04803 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = {
>
>  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
>
> -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t 
> opcode)
> +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
>  {
> +    uint32_t opcode;
> +    bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0);
> +
>      ctx->virt_inst_excp = false;
> -    ctx->cur_insn_len = insn_len(opcode);
> +    if (pc_is_4byte_align) {
> +        /*
> +         * Load 4 bytes at once to make instruction fetch atomically.
> +         *
> +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> +         * across pages. We could preload 4 bytes instruction no matter
> +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> +         * additional page fault.
> +         */
> +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> +    } else {
> +        /*
> +         * For unaligned pc, instruction preload may trigger additional
> +         * page fault so we only load 2 bytes here.
> +         */
> +        opcode = (uint32_t) translator_lduw(env, &ctx->base, 
> ctx->base.pc_next);
> +    }
> +    ctx->ol = ctx->xl;
> +
> +    ctx->cur_insn_len = insn_len((uint16_t)opcode);
>      /* Check for compressed insn */
>      if (ctx->cur_insn_len == 2) {
> -        ctx->opcode = opcode;
> +        ctx->opcode = (uint16_t)opcode;
>          /*
>           * The Zca extension is added as way to refer to instructions in the 
> C
>           * extension that do not include the floating-point loads and stores
> @@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, 
> DisasContext *ctx, uint16_t opcode)
>              return;
>          }
>      } else {
> -        uint32_t opcode32 = opcode;
> -        opcode32 = deposit32(opcode32, 16, 16,
> -                             translator_lduw(env, &ctx->base,
> -                                             ctx->base.pc_next + 2));
> -        ctx->opcode = opcode32;
> +        if (!pc_is_4byte_align) {
> +            /* Load last 2 bytes of instruction here */
> +            opcode = deposit32(opcode, 16, 16,
> +                               translator_lduw(env, &ctx->base,
> +                                               ctx->base.pc_next + 2));
> +        }
> +        ctx->opcode = opcode;
>
>          for (guint i = 0; i < ctx->decoders->len; ++i) {
>              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> -            if (func(ctx, opcode32)) {
> +            if (func(ctx, opcode)) {
>                  return;
>              }
>          }
> @@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase 
> *dcbase, CPUState *cpu)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
>      CPURISCVState *env = cpu_env(cpu);
> -    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
>
> -    ctx->ol = ctx->xl;
> -    decode_opc(env, ctx, opcode16);
> +    decode_opc(env, ctx);
>      ctx->base.pc_next += ctx->cur_insn_len;
>
>      /*
> --
> 2.17.1
>

Reply via email to