On Fri, Aug 09, 2019 at 07:00:19PM +0300, Adalbert Lazăr wrote:
> From: Nicușor Cîțu <[email protected]>
>
> This would be used either if the introspection tool requests it as a
> reply to a KVMI_EVENT_PF event or to cope with instructions that cannot
> be handled by the x86 emulator during the handling of a VMEXIT. In
> these situations, all other vCPU-s are kicked and held, the EPT-based
> protection is removed and the guest is single stepped by the vCPU that
> triggered the initial VMEXIT. Upon completion the EPT-based protection
> is reinstalled and all vCPU-s are allowed to return to the guest.
>
> This is a rather slow workaround that kicks in occasionally. In the
> future, the most frequently single-stepped instructions should be added
> to the emulator (usually, stores to and from memory - SSE/AVX).
>
> For the moment it works only on Intel.
>
> CC: Jim Mattson <[email protected]>
> CC: Sean Christopherson <[email protected]>
> CC: Joerg Roedel <[email protected]>
> Signed-off-by: Nicușor Cîțu <[email protected]>
> Co-developed-by: Mihai Donțu <[email protected]>
> Signed-off-by: Mihai Donțu <[email protected]>
> Co-developed-by: Adalbert Lazăr <[email protected]>
> Signed-off-by: Adalbert Lazăr <[email protected]>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +
>  arch/x86/kvm/kvmi.c             |  47 ++++++++++-
>  arch/x86/kvm/svm.c              |   5 ++
>  arch/x86/kvm/vmx/vmx.c          |  17 ++++
>  arch/x86/kvm/x86.c              |  19 +++++
>  include/linux/kvmi.h            |   4 +
>  virt/kvm/kvmi.c                 | 145 +++++++++++++++++++++++++++++++-
>  virt/kvm/kvmi_int.h             |  16 ++++
>  8 files changed, 253 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ad36a5fc2048..60e2c298d469 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1016,6 +1016,7 @@ struct kvm_x86_ops {
> void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
> bool enable);
> bool (*desc_intercept)(struct kvm_vcpu *vcpu, bool enable);
> + void (*set_mtf)(struct kvm_vcpu *vcpu, bool enable);
MTF is a VMX-specific implementation of single-stepping; this should be
enable_single_step() or something along those lines.  For example, I assume
SVM could implement something that is mostly functional via RFLAGS.TF.
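
Not asking for it in this series, but here is a rough, untested sketch of the
sort of thing I have in mind for SVM (hypothetical callback name, and it would
still need the #DB intercept plumbing):

	static void svm_enable_single_step(struct kvm_vcpu *vcpu, bool enable)
	{
		/* Approximate single-stepping by setting RFLAGS.TF so the
		 * guest takes a #DB after the next instruction.
		 */
		unsigned long rflags = kvm_get_rflags(vcpu);

		if (enable)
			rflags |= X86_EFLAGS_TF;
		else
			rflags &= ~X86_EFLAGS_TF;

		kvm_set_rflags(vcpu, rflags);
	}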
> void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
> bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
> bool (*spt_fault)(struct kvm_vcpu *vcpu);
> @@ -1628,6 +1629,8 @@ void kvm_arch_msr_intercept(struct kvm_vcpu *vcpu,
> unsigned int msr,
> bool enable);
> bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
> bool kvm_spt_fault(struct kvm_vcpu *vcpu);
> +void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable);
> +void kvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
> void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable);
>
> #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
> index 04cac5b8a4d0..f0ab4bd9eb37 100644
> --- a/arch/x86/kvm/kvmi.c
> +++ b/arch/x86/kvm/kvmi.c
> @@ -520,7 +520,6 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
> u32 ctx_size;
> u64 ctx_addr;
> u32 action;
> - bool singlestep_ignored;
> bool ret = false;
>
> if (!kvm_spt_fault(vcpu))
> @@ -533,7 +532,7 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
> if (ivcpu->effective_rep_complete)
> return true;
>
> - action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &singlestep_ignored,
> + action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &ivcpu->ss_requested,
> &ivcpu->rep_complete, &ctx_addr,
> ivcpu->ctx_data, &ctx_size);
>
> @@ -547,6 +546,8 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
> ret = true;
> break;
> case KVMI_EVENT_ACTION_RETRY:
> + if (ivcpu->ss_requested && !kvmi_start_ss(vcpu, gpa, access))
> + ret = true;
> break;
> default:
> kvmi_handle_common_event_actions(vcpu, action, "PF");
> @@ -758,6 +759,48 @@ int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> +void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu)
> +{
> + kvm_set_mtf(vcpu, true);
> +
> + /*
> + * Set block by STI only if the RFLAGS.IF = 1.
> + * Blocking by both STI and MOV/POP SS is not possible.
> + */
> + if (kvm_arch_interrupt_allowed(vcpu))
> + kvm_set_interrupt_shadow(vcpu, KVM_X86_SHADOW_INT_STI);
This is wrong; the STI shadow only exists if interrupts were unblocked prior
to STI.  I'm guessing this is a hack to work around
kvmi_arch_stop_single_step() not properly handling the clearing case.
> +
> +}
> +
> +void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu)
> +{
> + kvm_set_mtf(vcpu, false);
> + /*
> + * The blocking by STI is cleared after the guest
> + * executes one instruction or incurs an exception.
> + * However, we might stop the SS before entering the guest,
> + * so make sure we clear the STI blocking.
> + */
> + kvm_set_interrupt_shadow(vcpu, 0);
There are only three callers of kvmi_stop_ss(); it should be possible to
accurately update interruptibility in each case (rough sketch below):
  - kvmi_run_ss() failure: do nothing
  - VM-Exit that wasn't a single-step: clear interruptibility if the
    guest executed an instruction (including faulting on an instruction)
  - MTF VM-Exit: do nothing (the VMCS should already be up-to-date)
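
I.e. something along these lines (hypothetical, keeping this patch's helper
names for clarity), with each caller deciding based on the cases above:

	void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu,
					bool clear_int_shadow)
	{
		/* Disable MTF (see the naming comment above). */
		kvm_set_mtf(vcpu, false);

		/* Only touch interruptibility when the caller knows it is
		 * stale; on an MTF exit the VMCS is already up-to-date.
		 */
		if (clear_int_shadow)
			kvm_set_interrupt_shadow(vcpu, 0);
	}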
> +}
> +
> +u8 kvmi_arch_relax_page_access(u8 old, u8 new)
> +{
> + u8 ret = old | new;
> +
> + /*
> + * An SPTE entry with just the -wx bits set can trigger a
> + * misconfiguration error from the hardware, as it's the case
> + * for x86 where this access mode is used to mark I/O memory.
> + * Thus, we make sure that -wx accesses are translated to rwx.
> + */
> + if ((ret & (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X)) ==
> + (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X))
> + ret |= KVMI_PAGE_ACCESS_R;
> +
> + return ret;
> +}
> +
> static const struct {
> unsigned int allow_bit;
> enum kvm_page_track_mode track_mode;
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index b178b8900660..3481c0247680 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -7183,6 +7183,10 @@ static bool svm_spt_fault(struct kvm_vcpu *vcpu)
> return (svm->vmcb->control.exit_code == SVM_EXIT_NPF);
> }
>
> +static void svm_set_mtf(struct kvm_vcpu *vcpu, bool enable)
> +{
> +}
> +
> static void svm_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable)
> {
> }
> @@ -7225,6 +7229,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
> .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
> .has_emulated_msr = svm_has_emulated_msr,
>
> + .set_mtf = svm_set_mtf,
> .cr3_write_exiting = svm_cr3_write_exiting,
> .msr_intercept = svm_msr_intercept,
> .desc_intercept = svm_desc_intercept,
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 7d1e341b51ad..f0369d0574dc 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -5384,6 +5384,7 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
>
> static int handle_monitor_trap(struct kvm_vcpu *vcpu)
> {
> + kvmi_stop_ss(vcpu);
> return 1;
> }
>
> @@ -5992,6 +5993,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
> }
> }
>
> + if (kvmi_vcpu_enabled_ss(vcpu)
> + && exit_reason != EXIT_REASON_EPT_VIOLATION
> + && exit_reason != EXIT_REASON_MONITOR_TRAP_FLAG)
Bad indentation.  This is prevalent throughout the series.
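
E.g. the above should look something like:

	if (kvmi_vcpu_enabled_ss(vcpu) &&
	    exit_reason != EXIT_REASON_EPT_VIOLATION &&
	    exit_reason != EXIT_REASON_MONITOR_TRAP_FLAG)
		kvmi_stop_ss(vcpu);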
> + kvmi_stop_ss(vcpu);
> +
> if (exit_reason < kvm_vmx_max_exit_handlers
> && kvm_vmx_exit_handlers[exit_reason])
> return kvm_vmx_exit_handlers[exit_reason](vcpu);
> @@ -7842,6 +7848,16 @@ static __exit void hardware_unsetup(void)
> free_kvm_area();
> }
>
> +static void vmx_set_mtf(struct kvm_vcpu *vcpu, bool enable)
> +{
> + if (enable)
> + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
> + CPU_BASED_MONITOR_TRAP_FLAG);
> + else
> + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
> + CPU_BASED_MONITOR_TRAP_FLAG);
> +}
> +
> static void vmx_msr_intercept(struct kvm_vcpu *vcpu, unsigned int msr,
> bool enable)
> {
> @@ -7927,6 +7943,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
> .cpu_has_accelerated_tpr = report_flexpriority,
> .has_emulated_msr = vmx_has_emulated_msr,
>
> + .set_mtf = vmx_set_mtf,
> .msr_intercept = vmx_msr_intercept,
> .cr3_write_exiting = vmx_cr3_write_exiting,
> .desc_intercept = vmx_desc_intercept,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 38aaddadb93a..65855340249a 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -7358,6 +7358,13 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
> {
> int r;
>
> + if (kvmi_vcpu_enabled_ss(vcpu))
> + /*
> + * We cannot inject events during single-stepping.
> + * Try again later.
> + */
> + return -1;
> +
> /* try to reinject previous events if any */
>
> if (vcpu->arch.exception.injected)
> @@ -10134,6 +10141,18 @@ void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable)
> }
> EXPORT_SYMBOL(kvm_control_cr3_write_exiting);
>
> +void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable)
> +{
> + kvm_x86_ops->set_mtf(vcpu, enable);
> +}
> +EXPORT_SYMBOL(kvm_set_mtf);
> +
> +void kvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
> +{
> + kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
> +}
> +EXPORT_SYMBOL(kvm_set_interrupt_shadow);
Why do these wrappers exist, and why are they exported?  Introspection is
built into KVM; is there any reason not to use kvm_x86_ops directly?  They
most definitely don't need to be exported.
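
E.g. kvmi_arch_start_single_step() and co. can simply do

	kvm_x86_ops->set_mtf(vcpu, true);

without bouncing through x86.c.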
> +
> bool kvm_spt_fault(struct kvm_vcpu *vcpu)
> {
> return kvm_x86_ops->spt_fault(vcpu);
> diff --git a/include/linux/kvmi.h b/include/linux/kvmi.h
> index 5d162b9e67f2..1dc90284dc3a 100644
> --- a/include/linux/kvmi.h
> +++ b/include/linux/kvmi.h
> @@ -22,6 +22,8 @@ bool kvmi_queue_exception(struct kvm_vcpu *vcpu);
> void kvmi_trap_event(struct kvm_vcpu *vcpu);
> bool kvmi_descriptor_event(struct kvm_vcpu *vcpu, u8 descriptor, u8 write);
> void kvmi_handle_requests(struct kvm_vcpu *vcpu);
> +void kvmi_stop_ss(struct kvm_vcpu *vcpu);
> +bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu);
Spell out single step, and be consistent between single_step and singlestep.
That applies to pretty much every variable and function, unless doing so
really makes the verbosity obnoxious.
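
E.g. (hypothetical names, just to illustrate):

	void kvmi_stop_single_step(struct kvm_vcpu *vcpu);
	bool kvmi_single_step_enabled(struct kvm_vcpu *vcpu);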
> void kvmi_init_emulate(struct kvm_vcpu *vcpu);
> void kvmi_activate_rep_complete(struct kvm_vcpu *vcpu);
> bool kvmi_bp_intercepted(struct kvm_vcpu *vcpu, u32 dbg);
> @@ -44,6 +46,8 @@ static inline void kvmi_handle_requests(struct kvm_vcpu *vcpu) { }
> static inline bool kvmi_hypercall_event(struct kvm_vcpu *vcpu) { return false; }
> static inline bool kvmi_queue_exception(struct kvm_vcpu *vcpu) { return true; }
> static inline void kvmi_trap_event(struct kvm_vcpu *vcpu) { }
> +static inline void kvmi_stop_ss(struct kvm_vcpu *vcpu) { }
> +static inline bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu) { return false; }
> static inline void kvmi_init_emulate(struct kvm_vcpu *vcpu) { }
> static inline void kvmi_activate_rep_complete(struct kvm_vcpu *vcpu) { }
> static inline bool kvmi_bp_intercepted(struct kvm_vcpu *vcpu, u32 dbg)
> diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
> index d47a725a4045..a3a5af9080a9 100644
> --- a/virt/kvm/kvmi.c
> +++ b/virt/kvm/kvmi.c
> @@ -1260,11 +1260,19 @@ void kvmi_run_jobs(struct kvm_vcpu *vcpu)
> }
> }
>
> +static bool need_to_wait_for_ss(struct kvm_vcpu *vcpu)
> +{
> + struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> + struct kvmi *ikvm = IKVM(vcpu->kvm);
> +
> + return atomic_read(&ikvm->ss_active) && !ivcpu->ss_owner;
> +}
> +
> static bool need_to_wait(struct kvm_vcpu *vcpu)
> {
> struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
>
> - return ivcpu->reply_waiting;
> + return ivcpu->reply_waiting || need_to_wait_for_ss(vcpu);
> }
>
> static bool done_waiting(struct kvm_vcpu *vcpu)
> @@ -1572,6 +1580,141 @@ int kvmi_cmd_pause_vcpu(struct kvm_vcpu *vcpu, bool wait)
> return 0;
> }
>
> +void kvmi_stop_ss(struct kvm_vcpu *vcpu)
> +{
> + struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> + struct kvm *kvm = vcpu->kvm;
> + struct kvmi *ikvm;
> + int i;
> +
> + ikvm = kvmi_get(kvm);
> + if (!ikvm)
> + return;
> +
> + if (unlikely(!ivcpu->ss_owner)) {
> + kvmi_warn(ikvm, "%s\n", __func__);
> + goto out;
> + }
> +
> + for (i = ikvm->ss_level; i--;)
> + kvmi_set_gfn_access(kvm,
> + ikvm->ss_context[i].gfn,
> + ikvm->ss_context[i].old_access,
> + ikvm->ss_context[i].old_write_bitmap);
> +
> + ikvm->ss_level = 0;
> +
> + kvmi_arch_stop_single_step(vcpu);
> +
> + atomic_set(&ikvm->ss_active, false);
> + /*
> + * Make ss_active update visible
> + * before resuming all the other vCPUs.
> + */
> + smp_mb__after_atomic();
> + kvm_make_all_cpus_request(kvm, 0);
> +
> + ivcpu->ss_owner = false;
> +
> +out:
> + kvmi_put(kvm);
> +}
> +EXPORT_SYMBOL(kvmi_stop_ss);
> +
> +static bool kvmi_acquire_ss(struct kvm_vcpu *vcpu)
> +{
> + struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> + struct kvmi *ikvm = IKVM(vcpu->kvm);
> +
> + if (ivcpu->ss_owner)
> + return true;
> +
> + if (atomic_cmpxchg(&ikvm->ss_active, false, true) != false)
> + return false;
> +
> + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_INTROSPECTION |
> + KVM_REQUEST_WAIT);
> +
> + ivcpu->ss_owner = true;
> +
> + return true;
> +}
> +
> +static bool kvmi_run_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access)
> +{
> + struct kvmi *ikvm = IKVM(vcpu->kvm);
> + u8 old_access, new_access;
> + u32 old_write_bitmap;
> + gfn_t gfn = gpa_to_gfn(gpa);
> + int err;
> +
> + kvmi_arch_start_single_step(vcpu);
> +
> + err = kvmi_get_gfn_access(ikvm, gfn, &old_access, &old_write_bitmap);
> + /* likely was removed from radix tree due to rwx */
> + if (err) {
> + kvmi_warn(ikvm, "%s: gfn 0x%llx not found in the radix tree\n",
> + __func__, gfn);
> + return true;
> + }
> +
> + if (ikvm->ss_level == SINGLE_STEP_MAX_DEPTH - 1) {
> + kvmi_err(ikvm, "single step limit reached\n");
> + return false;
> + }
> +
> + ikvm->ss_context[ikvm->ss_level].gfn = gfn;
> + ikvm->ss_context[ikvm->ss_level].old_access = old_access;
> + ikvm->ss_context[ikvm->ss_level].old_write_bitmap = old_write_bitmap;
> + ikvm->ss_level++;
> +
> + new_access = kvmi_arch_relax_page_access(old_access, access);
> +
> + kvmi_set_gfn_access(vcpu->kvm, gfn, new_access, old_write_bitmap);
> +
> + return true;
> +}
> +
> +bool kvmi_start_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access)
> +{
> + bool ret = false;
> +
> + while (!kvmi_acquire_ss(vcpu)) {
> + int err = kvmi_run_jobs_and_wait(vcpu);
> +
> + if (err) {
> + kvmi_err(IKVM(vcpu->kvm), "kvmi_acquire_ss() has failed\n");
> + goto out;
> + }
> + }
> +
> + if (kvmi_run_ss(vcpu, gpa, access))
> + ret = true;
> + else
> + kvmi_stop_ss(vcpu);
> +
> +out:
> + return ret;
> +}
> +
> +bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu)
> +{
> + struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> + struct kvmi *ikvm;
> + bool ret;
> +
> + ikvm = kvmi_get(vcpu->kvm);
> + if (!ikvm)
> + return false;
> +
> + ret = ivcpu->ss_owner;
> +
> + kvmi_put(vcpu->kvm);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(kvmi_vcpu_enabled_ss);
> +
> static void kvmi_job_abort(struct kvm_vcpu *vcpu, void *ctx)
> {
> struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
> diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
> index d7f9858d3e97..1550fe33ed48 100644
> --- a/virt/kvm/kvmi_int.h
> +++ b/virt/kvm/kvmi_int.h
> @@ -126,6 +126,9 @@ struct kvmi_vcpu {
> DECLARE_BITMAP(high, KVMI_NUM_MSR);
> } msr_mask;
>
> + bool ss_owner;
Why is single-stepping mutually exclusive across all vCPUs? Does that
always have to be the case?
> + bool ss_requested;
> +
> struct list_head job_list;
> spinlock_t job_lock;
>
> @@ -151,6 +154,15 @@ struct kvmi {
> DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
> DECLARE_BITMAP(vm_ev_mask, KVMI_NUM_EVENTS);
>
> +#define SINGLE_STEP_MAX_DEPTH 8
> + struct {
> + gfn_t gfn;
> + u8 old_access;
> + u32 old_write_bitmap;
> + } ss_context[SINGLE_STEP_MAX_DEPTH];
> + u8 ss_level;
> + atomic_t ss_active;
Good opportunity for an unnamed struct, e.g.:

	struct {
		struct single_step_context[...];
		bool owner;
		bool requested;
		u8 level;
		atomic_t active;
	} single_step;
> +
> struct {
> bool initialized;
> atomic_t enabled;
> @@ -224,6 +236,7 @@ int kvmi_add_job(struct kvm_vcpu *vcpu,
> void *ctx, void (*free_fct)(void *ctx));
> void kvmi_handle_common_event_actions(struct kvm_vcpu *vcpu, u32 action,
> const char *str);
> +bool kvmi_start_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access);
>
> /* arch */
> void kvmi_arch_update_page_tracking(struct kvm *kvm,
> @@ -274,6 +287,9 @@ int kvmi_arch_cmd_inject_exception(struct kvm_vcpu *vcpu, u8 vector,
> u64 address);
> int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
> const struct kvmi_control_cr *req);
> +void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu);
> +void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu);
> +u8 kvmi_arch_relax_page_access(u8 old, u8 new);
> int kvmi_arch_cmd_control_msr(struct kvm_vcpu *vcpu,
> const struct kvmi_control_msr *req);
> int kvmi_arch_cmd_get_mtrr_type(struct kvm_vcpu *vcpu, u64 gpa, u8 *type);