Hi, Gentle ping on this patch.
Thanks, Jim Shu On Thu, May 8, 2025 at 5:48 PM Jim Shu <jim....@sifive.com> wrote: > > Support atomic 4-byte instruction fetch when the instruction is naturally > aligned. > > The current implementation is not atomic because it loads the instruction in two > separate accesses, one for the first and one for the last 2 bytes. We now load 4 bytes at once to preserve > atomicity. This instruction preload method only applies when the instruction > is 4-byte aligned. If the instruction is unaligned, it could span two pages, > so the preload could trigger an additional page fault. > > We encountered this issue when stress-testing the enabling and disabling of > Linux kernel ftrace. Ftrace with kernel preemption requires concurrent > modification and execution of instructions, so a non-atomic instruction > fetch causes a race condition. We may fetch a wrong instruction > that is a mix of two different instructions. > > Also, the RISC-V Profiles require this feature to be provided by hardware: RVA20U64 > Ziccif guarantees the atomicity of an instruction fetch when it is > naturally aligned. > > This commit depends on the atomic read support of translator_ld in > the commit 6a9dfe1984b0c593fb0ddb52d4e70832e6201dd6. 
> > Signed-off-by: Jim Shu <jim....@sifive.com> > Reviewed-by: Frank Chang <frank.ch...@sifive.com> > --- > target/riscv/translate.c | 46 +++++++++++++++++++++++++++++----------- > 1 file changed, 34 insertions(+), 12 deletions(-) > > diff --git a/target/riscv/translate.c b/target/riscv/translate.c > index 85128f997b..77edf04803 100644 > --- a/target/riscv/translate.c > +++ b/target/riscv/translate.c > @@ -1222,13 +1222,35 @@ const RISCVDecoder decoder_table[] = { > > const size_t decoder_table_size = ARRAY_SIZE(decoder_table); > > -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t > opcode) > +static void decode_opc(CPURISCVState *env, DisasContext *ctx) > { > + uint32_t opcode; > + bool pc_is_4byte_align = ((ctx->base.pc_next % 4) == 0); > + > ctx->virt_inst_excp = false; > - ctx->cur_insn_len = insn_len(opcode); > + if (pc_is_4byte_align) { > + /* > + * Load 4 bytes at once to make instruction fetch atomically. > + * > + * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be > + * across pages. We could preload 4 bytes instruction no matter > + * real one is 2 or 4 bytes. Instruction preload wouldn't trigger > + * additional page fault. > + */ > + opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next); > + } else { > + /* > + * For unaligned pc, instruction preload may trigger additional > + * page fault so we only load 2 bytes here. 
> + */ > + opcode = (uint32_t) translator_lduw(env, &ctx->base, > ctx->base.pc_next); > + } > + ctx->ol = ctx->xl; > + > + ctx->cur_insn_len = insn_len((uint16_t)opcode); > /* Check for compressed insn */ > if (ctx->cur_insn_len == 2) { > - ctx->opcode = opcode; > + ctx->opcode = (uint16_t)opcode; > /* > * The Zca extension is added as way to refer to instructions in the > C > * extension that do not include the floating-point loads and stores > @@ -1238,15 +1260,17 @@ static void decode_opc(CPURISCVState *env, > DisasContext *ctx, uint16_t opcode) > return; > } > } else { > - uint32_t opcode32 = opcode; > - opcode32 = deposit32(opcode32, 16, 16, > - translator_lduw(env, &ctx->base, > - ctx->base.pc_next + 2)); > - ctx->opcode = opcode32; > + if (!pc_is_4byte_align) { > + /* Load last 2 bytes of instruction here */ > + opcode = deposit32(opcode, 16, 16, > + translator_lduw(env, &ctx->base, > + ctx->base.pc_next + 2)); > + } > + ctx->opcode = opcode; > > for (guint i = 0; i < ctx->decoders->len; ++i) { > riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i); > - if (func(ctx, opcode32)) { > + if (func(ctx, opcode)) { > return; > } > } > @@ -1324,10 +1348,8 @@ static void riscv_tr_translate_insn(DisasContextBase > *dcbase, CPUState *cpu) > { > DisasContext *ctx = container_of(dcbase, DisasContext, base); > CPURISCVState *env = cpu_env(cpu); > - uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next); > > - ctx->ol = ctx->xl; > - decode_opc(env, ctx, opcode16); > + decode_opc(env, ctx); > ctx->base.pc_next += ctx->cur_insn_len; > > /* > -- > 2.17.1 >