Re: [PATCH 1/7] KVM: x86 emulator: framework for streamlining arithmetic opcodes

Gleb Natapov Sat, 22 Dec 2012 09:26:55 -0800

On Sat, Dec 22, 2012 at 02:26:51PM +0200, Avi Kivity wrote:
> We emulate arithmetic opcodes by executing a "similar" instruction (same
> operation, different operands) on the cpu.  This ensures accurate emulation,
> esp. wrt. eflags.  However, the prologue and epilogue around the opcode are
> fairly long, consisting of a switch (for the operand size) and code to load
> and save the operands.  This is repeated for every opcode.
> 
> This patch introduces an alternative way to emulate arithmetic opcodes.
> Instead of the above, we have four (three on i386) functions consisting
> of just the opcode and a ret; one for each operand size.  For example:
> 
>    .align 8
>    em_notb:
>       not %al
>       ret
> 
>    .align 8
>    em_notw:
>       not %ax
>       ret
> 
>    .align 8
>    em_notl:
>       not %eax
>       ret
> 
>    .align 8
>    em_notq:
>       not %rax
>       ret
> 
> The prologue and epilogue are shared across all opcodes.  Note the functions
> use a special calling convention; notably eflags is an input/output parameter
> and is not clobbered.  Rather than dispatching the four functions through a
> jump table, the functions are all given a constant size (8 bytes) so their
> addresses can be calculated.
> 
> Signed-off-by: Avi Kivity <[email protected]>
> ---
>  arch/x86/kvm/emulate.c | 41 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 41 insertions(+)
> 
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index c7547b3..cdf7b97 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -148,6 +148,7 @@
>  #define Aligned     ((u64)1 << 41)  /* Explicitly aligned (e.g. MOVDQA) */
>  #define Unaligned   ((u64)1 << 42)  /* Explicitly unaligned (e.g. MOVDQU) */
>  #define Avx         ((u64)1 << 43)  /* Advanced Vector Extensions */
> +#define Fastop      ((u64)1 << 44)  /* Use opcode::u.fastop */
>  
>  #define X2(x...) x, x
>  #define X3(x...) X2(x), x
> @@ -158,6 +159,27 @@
>  #define X8(x...) X4(x), X4(x)
>  #define X16(x...) X8(x), X8(x)
>  
> +#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
> +#define FASTOP_SIZE 8
> +
> +/*
> + * fastop functions have a special calling convention:
> + *
> + * dst:    [rdx]:rax  (in/out)
Maybe I am missing something obvious, but I do not see why rdx is here.


> + * src:    rbx        (in/out)
> + * src2:   rcx        (in)
> + * flags:  rflags     (in/out)
> + *
> + * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
> + * different operand sizes can be reached by calculation, rather than a jump
> + * table (which would be bigger than the code).
> + *
> + * fastop functions are declared as taking a never-defined fastop parameter,
> + * so they can't be called from C directly.
> + */
> +
> +struct fastop;
> +
>  struct opcode {
>       u64 flags : 56;
>       u64 intercept : 8;
> @@ -166,6 +188,7 @@ struct opcode {
>               const struct opcode *group;
>               const struct group_dual *gdual;
>               const struct gprefix *gprefix;
> +             void (*fastop)(struct fastop *fake);
>       } u;
>       int (*check_perm)(struct x86_emulate_ctxt *ctxt);
>  };
> @@ -3596,6 +3619,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
>  #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
>  #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
>  #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
> +#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
>  #define II(_f, _e, _i) \
>       { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
>  #define IIP(_f, _e, _i, _p) \
> @@ -4383,6 +4407,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
>               read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
>  }
>  
> +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop 
> *))
> +{
> +     ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
> +     fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
> +     asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
> +         : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags)
> +     : "c"(ctxt->src2.val), [fastop]"S"(fop));
> +     ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
> +     return X86EMUL_CONTINUE;
> +}
>  
>  int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
>  {
> @@ -4512,6 +4546,13 @@ special_insn:
>       }
>  
>       if (ctxt->execute) {
> +             if (ctxt->d & Fastop) {
> +                     void (*fop)(struct fastop *) = (void *)ctxt->execute;
> +                     rc = fastop(ctxt, fop);
> +                     if (rc != X86EMUL_CONTINUE)
> +                             goto done;
> +                     goto writeback;
> +             }
>               rc = ctxt->execute(ctxt);
>               if (rc != X86EMUL_CONTINUE)
>                       goto done;
> -- 
> 1.7.11.7

--
                        Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/7] KVM: x86 emulator: framework for streamlining arithmetic opcodes

Reply via email to