The MSHV accelerator requires an x86 instruction decoder/emulator in userland to emulate MMIO instructions. This change provides the MSHV-side support routines for the generalized i386 instruction decoder/emulator and wires them into the build.
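
For context, a vcpu-exit handler in a later patch is expected to drive the shared
decoder/emulator roughly as sketched below. The mshv_emulate_mmio() name and the
omitted register sync are illustrative only; decode_instruction(),
exec_instruction() and the emul_ops callbacks are the existing interfaces under
target/i386/emulate/, and linear_rip()/linear_addr() are the helpers added here.

    /*
     * Sketch only, not part of this patch: mshv_emulate_mmio() is a
     * placeholder name for a future MMIO vcpu-exit handler.
     */
    static void mshv_emulate_mmio(CPUState *cpu)
    {
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
        struct x86_decode decode = { 0 };

        /*
         * Fetch and decode the faulting instruction at CS:RIP; the
         * decoder resolves the fetch address via the linear_rip()
         * helper added in this patch.
         */
        decode_instruction(env, &decode);

        /*
         * Execute it; MMIO loads and stores are routed back to the
         * accelerator through emul_ops->read_mem()/write_mem().
         */
        exec_instruction(env, &decode);
    }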
Signed-off-by: Magnus Kulke <magnusku...@linux.microsoft.com>
---
 include/system/mshv.h           |  32 ++++
 target/i386/cpu.h               |   2 +-
 target/i386/emulate/meson.build |   7 +-
 target/i386/meson.build         |   2 +
 target/i386/mshv/meson.build    |   7 +
 target/i386/mshv/x86.c          | 330 ++++++++++++++++++++++++++++++++
 6 files changed, 377 insertions(+), 3 deletions(-)
 create mode 100644 include/system/mshv.h
 create mode 100644 target/i386/mshv/meson.build
 create mode 100644 target/i386/mshv/x86.c

diff --git a/include/system/mshv.h b/include/system/mshv.h
new file mode 100644
index 0000000000..8380b92da2
--- /dev/null
+++ b/include/system/mshv.h
@@ -0,0 +1,32 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors:
+ *  Ziqiao Zhou  <ziqiaoz...@microsoft.com>
+ *  Magnus Kulke <magnusku...@microsoft.com>
+ *  Jinank Jain  <jinankj...@microsoft.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_MSHV_INT_H
+#define QEMU_MSHV_INT_H
+
+#ifdef COMPILING_PER_TARGET
+#ifdef CONFIG_MSHV
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+#else
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+
+/* cpu */
+/* EFER (technically not a register) bits */
+#define EFER_LMA ((uint64_t)0x400)
+#define EFER_LME ((uint64_t)0x100)
+
+#endif
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 4f8ed8868e..db6a37b271 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2101,7 +2101,7 @@ typedef struct CPUArchState {
     QEMUTimer *xen_periodic_timer;
     QemuMutex xen_timers_lock;
 #endif
-#if defined(CONFIG_HVF)
+#if defined(CONFIG_HVF) || defined(CONFIG_MSHV)
     X86LazyFlags lflags;
     void *emu_mmio_buf;
 #endif
diff --git a/target/i386/emulate/meson.build b/target/i386/emulate/meson.build
index 4edd4f462f..b6dafb6a5b 100644
--- a/target/i386/emulate/meson.build
+++ b/target/i386/emulate/meson.build
@@ -1,5 +1,8 @@
-i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files(
+emulator_files = files(
   'x86_decode.c',
   'x86_emu.c',
   'x86_flags.c',
-))
+)
+
+i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: emulator_files)
+i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_files)
diff --git a/target/i386/meson.build b/target/i386/meson.build
index c1aacea613..6097e5c427 100644
--- a/target/i386/meson.build
+++ b/target/i386/meson.build
@@ -11,6 +11,7 @@ i386_ss.add(when: 'CONFIG_SEV', if_true: files('host-cpu.c', 'confidential-guest
 # x86 cpu type
 i386_ss.add(when: 'CONFIG_KVM', if_true: files('host-cpu.c'))
 i386_ss.add(when: 'CONFIG_HVF', if_true: files('host-cpu.c'))
+i386_ss.add(when: 'CONFIG_MSHV', if_true: files('host-cpu.c'))
 
 i386_system_ss = ss.source_set()
 i386_system_ss.add(files(
@@ -32,6 +33,7 @@ subdir('nvmm')
 subdir('hvf')
 subdir('tcg')
 subdir('emulate')
+subdir('mshv')
 
 target_arch += {'i386': i386_ss}
 target_system_arch += {'i386': i386_system_ss}
diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build
new file mode 100644
index 0000000000..8ddaa7c11d
--- /dev/null
+++ b/target/i386/mshv/meson.build
@@ -0,0 +1,7 @@
+i386_mshv_ss = ss.source_set()
+
+i386_mshv_ss.add(files(
+  'x86.c',
+))
+
+i386_system_ss.add_all(when: 'CONFIG_MSHV', if_true: i386_mshv_ss)
diff --git a/target/i386/mshv/x86.c b/target/i386/mshv/x86.c
new file mode 100644
index 0000000000..581710fd06
--- /dev/null
+++ b/target/i386/mshv/x86.c
@@ -0,0 +1,330 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors:
+ *  Magnus Kulke <magnusku...@microsoft.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "qemu/typedefs.h"
+#include "qemu/error-report.h"
+#include "system/mshv.h"
+
+/* RW or Exec segment */
+static const uint8_t RWRX_SEGMENT_TYPE = 0x2;
+static const uint8_t CODE_SEGMENT_TYPE = 0x8;
+static const uint8_t EXPAND_DOWN_SEGMENT_TYPE = 0x4;
+
+typedef enum CpuMode {
+    REAL_MODE,
+    PROTECTED_MODE,
+    LONG_MODE,
+} CpuMode;
+
+static CpuMode cpu_mode(CPUState *cpu)
+{
+    enum CpuMode m = REAL_MODE;
+
+    if (x86_is_protected(cpu)) {
+        m = PROTECTED_MODE;
+
+        if (x86_is_long_mode(cpu)) {
+            m = LONG_MODE;
+        }
+    }
+
+    return m;
+}
+
+static bool segment_type_ro(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & (~RWRX_SEGMENT_TYPE)) == 0;
+}
+
+static bool segment_type_code(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & CODE_SEGMENT_TYPE) != 0;
+}
+
+static bool segment_expands_down(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+
+    if (segment_type_code(seg)) {
+        return false;
+    }
+
+    return (type_ & EXPAND_DOWN_SEGMENT_TYPE) != 0;
+}
+
+static uint32_t segment_limit(const SegmentCache *seg)
+{
+    uint32_t limit = seg->limit;
+    uint32_t granularity = (seg->flags & DESC_G_MASK) != 0;
+
+    if (granularity != 0) {
+        limit = (limit << 12) | 0xFFF;
+    }
+
+    return limit;
+}
+
+static uint8_t segment_db(const SegmentCache *seg)
+{
+    return (seg->flags >> DESC_B_SHIFT) & 1;
+}
+
+static uint32_t segment_max_limit(const SegmentCache *seg)
+{
+    if (segment_db(seg) != 0) {
+        return 0xFFFFFFFF;
+    }
+    return 0xFFFF;
+}
+
+static int linearize(CPUState *cpu,
+                     target_ulong logical_addr, target_ulong *linear_addr,
+                     X86Seg seg_idx)
+{
+    enum CpuMode mode;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    SegmentCache *seg = &env->segs[seg_idx];
+    target_ulong base = seg->base;
+    target_ulong logical_addr_32b;
+    uint32_t limit;
+    /* TODO: the emulator will not pass us "write" indicator yet */
+    bool write = false;
+
+    mode = cpu_mode(cpu);
+
+    switch (mode) {
+    case LONG_MODE:
+        if (__builtin_add_overflow(logical_addr, base, linear_addr)) {
+            error_report("Address overflow");
+            return -1;
+        }
+        break;
+    case PROTECTED_MODE:
+    case REAL_MODE:
+        if (segment_type_ro(seg) && write) {
+            error_report("Cannot write to read-only segment");
+            return -1;
+        }
+
+        logical_addr_32b = logical_addr & 0xFFFFFFFF;
+        limit = segment_limit(seg);
+
+        if (segment_expands_down(seg)) {
+            if (logical_addr_32b >= limit) {
+                error_report("Address exceeds limit (expands down)");
+                return -1;
+            }
+
+            limit = segment_max_limit(seg);
+        }
+
+        if (logical_addr_32b > limit) {
+            error_report("Address exceeds limit %u", limit);
+            return -1;
+        }
+        *linear_addr = logical_addr_32b + base;
+        break;
+    default:
+        error_report("Unknown cpu mode: %d", mode);
+        return -1;
+    }
+
+    return 0;
+}
+
+bool x86_read_segment_descriptor(CPUState *cpu,
+                                 struct x86_segment_descriptor *desc,
+                                 x86_segment_selector sel)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    target_ulong gva;
+    /* int ret; */
+
+    memset(desc, 0, sizeof(*desc));
+
+    /* valid gdt descriptors start from index 1 */
+    if (!sel.index && GDT_SEL == sel.ti) {
+        return false;
+    }
+
+    if (GDT_SEL == sel.ti) {
+        base = env->gdt.base;
+        limit = env->gdt.limit;
+    } else {
+        base = env->ldt.base;
+        limit = env->ldt.limit;
+    }
+
+    if (sel.index * 8 >= limit) {
+        return false;
+    }
+
+    gva = base + sel.index * 8;
+    emul_ops->read_mem(cpu, desc, gva, sizeof(*desc));
+
+    return true;
+}
+
+bool x86_write_segment_descriptor(CPUState *cpu,
+                                  struct x86_segment_descriptor *desc,
+                                  x86_segment_selector sel)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    /* int ret; */
+    target_ulong gva;
+
+    if (GDT_SEL == sel.ti) {
+        base = env->gdt.base;
+        limit = env->gdt.limit;
+    } else {
+        base = env->ldt.base;
+        limit = env->ldt.limit;
+    }
+
+    if (sel.index * 8 >= limit) {
+        return false;
+    }
+
+    gva = base + sel.index * 8;
+    emul_ops->write_mem(cpu, desc, gva, sizeof(*desc));
+
+    return true;
+}
+
+bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc,
+                        int gate)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    target_ulong gva;
+
+    base = env->idt.base;
+    limit = env->idt.limit;
+
+    memset(idt_desc, 0, sizeof(*idt_desc));
+    if (gate * 8 >= limit) {
+        perror("call gate exceeds idt limit");
+        return false;
+    }
+
+    gva = base + gate * 8;
+    emul_ops->read_mem(cpu, idt_desc, gva, sizeof(*idt_desc));
+
+    return true;
+}
+
+bool x86_is_protected(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr0 = env->cr[0];
+
+    return cr0 & CR0_PE_MASK;
+}
+
+bool x86_is_real(CPUState *cpu)
+{
+    return !x86_is_protected(cpu);
+}
+
+bool x86_is_v8086(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    return x86_is_protected(cpu) && (env->eflags & VM_MASK);
+}
+
+bool x86_is_long_mode(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t efer = env->efer;
+
+    return ((efer & (EFER_LME | EFER_LMA)) == (EFER_LME | EFER_LMA));
+}
+
+bool x86_is_long64_mode(CPUState *cpu)
+{
+    error_report("unimplemented: is_long64_mode()");
+    abort();
+}
+
+bool x86_is_paging_mode(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr0 = env->cr[0];
+
+    return cr0 & CR0_PG_MASK;
+}
+
+bool x86_is_pae_enabled(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr4 = env->cr[4];
+
+    return cr4 & CR4_PAE_MASK;
+}
+
+target_ulong linear_addr(CPUState *cpu, target_ulong addr, X86Seg seg)
+{
+    int ret;
+    target_ulong linear_addr;
+
+    /* return vmx_read_segment_base(cpu, seg) + addr; */
+    ret = linearize(cpu, addr, &linear_addr, seg);
+    if (ret < 0) {
+        error_report("failed to linearize address");
+        abort();
+    }
+
+    return linear_addr;
+}
+
+target_ulong linear_addr_size(CPUState *cpu, target_ulong addr, int size,
+                              X86Seg seg)
+{
+    switch (size) {
+    case 2:
+        addr = (uint16_t)addr;
+        break;
+    case 4:
+        addr = (uint32_t)addr;
+        break;
+    default:
+        break;
+    }
+    return linear_addr(cpu, addr, seg);
+}
+
+target_ulong linear_rip(CPUState *cpu, target_ulong rip)
+{
+    return linear_addr(cpu, rip, R_CS);
+}
-- 
2.34.1