> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:55 AM
>
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
>
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0,
> we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
> bit which changed; If it did, we exit to L1. But if it didn't it means that
> it is we (L0) that wished to trap this event, so we handle it ourselves.
>
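(In code terms, that CR0 decision is essentially the guest/host-mask check that
nested_vmx_exit_handled_cr() further down does for the "mov to cr0" case:

	if (vmcs12->cr0_guest_host_mask & (val ^ vmcs12->cr0_read_shadow))
		return 1;	/* a bit L1 asked to own is changing */

i.e. exit to L1 only if a bit L1 asked to intercept is actually being modified.)
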
> The next two patches add additional logic of what to do when an interrupt or
> exception is injected: Does L0 need to do it, should we exit to L1 to do it,
> or should we resume L2 and keep the exception to be injected later.
>
> We keep a new flag, "nested_run_pending", which can override the decision of
> which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
> L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH of L2
> and therefore expects L2 to be run (and perhaps be injected with an event it
> specified, etc.). Nested_run_pending is especially intended to avoid switching
> to L1 in the injection decision-point described above.
>
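(If I read the patch right, this flag is consumed as the very first check in
nested_vmx_exit_handled() below:

	if (vmx->nested.nested_run_pending)
		return 0;	/* must resume L2, never reflect to L1 */

so "must run L2 next" simply short-circuits the reflect-to-L1 decision. My
question further down is about whether the flag is needed at all.)
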
> Signed-off-by: Nadav Har'El <[email protected]>
> ---
>  arch/x86/kvm/vmx.c |  256 ++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 255 insertions(+), 1 deletion(-)
>
> --- .before/arch/x86/kvm/vmx.c 2011-05-16 22:36:49.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c 2011-05-16 22:36:49.000000000 +0300
> @@ -351,6 +351,8 @@ struct nested_vmx {
> /* Saving the VMCS that we used for running L1 */
> struct saved_vmcs saved_vmcs01;
> u64 vmcs01_tsc_offset;
> + /* L2 must run next, and mustn't decide to exit to L1. */
> + bool nested_run_pending;
> /*
> * Guest pages referred to in vmcs02 with host-physical pointers, so
> * we must keep them pinned while L2 runs.
> @@ -870,6 +872,20 @@ static inline bool nested_cpu_has2(struc
> (vmcs12->secondary_vm_exec_control & bit);
> }
>
> +static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
> +{
> + return is_guest_mode(vcpu) &&
> + (get_vmcs12(vcpu)->pin_based_vm_exec_control &
> + PIN_BASED_VIRTUAL_NMIS);
> +}
Any reason to add a guest-mode check here? I didn't see such a check in your
earlier nested_cpu_has_xxx helpers. It would be clearer to keep this helper in
the existing nested_cpu_has_xxx style and call is_guest_mode() explicitly at
the call sites, which keeps the usage consistent.
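Something along these lines (untested sketch, just to show the style I mean)
would match the other helpers:

	static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
	{
		/* same pattern as nested_cpu_has()/nested_cpu_has2() */
		return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
	}

with callers spelling out the guest-mode part explicitly:

	is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(get_vmcs12(vcpu))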
> +
> +static inline bool is_exception(u32 intr_info)
> +{
> +	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> + == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> +}
> +
> +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
> static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
> struct vmcs12 *vmcs12,
> u32 reason, unsigned long qualification);
> @@ -5281,6 +5297,232 @@ static int (*kvm_vmx_exit_handlers[])(st
> static const int kvm_vmx_max_exit_handlers =
> ARRAY_SIZE(kvm_vmx_exit_handlers);
>
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an MSR access,
> + * rather than handle it ourselves in L0. I.e., check whether L1 expressed
> + * disinterest in the current event (read or write a specific MSR) by using an
> + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
> + */
> +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
> + struct vmcs12 *vmcs12, u32 exit_reason)
> +{
> + u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> + gpa_t bitmap;
> +
> +	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
> + return 1;
> +
> + /*
> + * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
> + * for the four combinations of read/write and low/high MSR numbers.
> + * First we need to figure out which of the four to use:
> + */
> + bitmap = vmcs12->msr_bitmap;
> + if (exit_reason == EXIT_REASON_MSR_WRITE)
> + bitmap += 2048;
> + if (msr_index >= 0xc0000000) {
> + msr_index -= 0xc0000000;
> + bitmap += 1024;
> + }
> +
> + /* Then read the msr_index'th bit from this bitmap: */
> + if (msr_index < 1024*8) {
> + unsigned char b;
> + kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
> + return 1 & (b >> (msr_index & 7));
> + } else
> + return 1; /* let L1 handle the wrong parameter */
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> + struct vmcs12 *vmcs12)
> +{
> + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> + int cr = exit_qualification & 15;
> + int reg = (exit_qualification >> 8) & 15;
> + unsigned long val = kvm_register_read(vcpu, reg);
> +
> + switch ((exit_qualification >> 4) & 3) {
> + case 0: /* mov to cr */
> + switch (cr) {
> + case 0:
> + if (vmcs12->cr0_guest_host_mask &
> + (val ^ vmcs12->cr0_read_shadow))
> + return 1;
> + break;
> + case 3:
> + if ((vmcs12->cr3_target_count >= 1 &&
> + vmcs12->cr3_target_value0 == val) ||
> + (vmcs12->cr3_target_count >= 2 &&
> + vmcs12->cr3_target_value1 == val) ||
> + (vmcs12->cr3_target_count >= 3 &&
> + vmcs12->cr3_target_value2 == val) ||
> + (vmcs12->cr3_target_count >= 4 &&
> + vmcs12->cr3_target_value3 == val))
> + return 0;
> + if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
> + return 1;
> + break;
> + case 4:
> + if (vmcs12->cr4_guest_host_mask &
> + (vmcs12->cr4_read_shadow ^ val))
> + return 1;
> + break;
> + case 8:
> + if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
> + return 1;
> + break;
> + }
> + break;
> + case 2: /* clts */
> + if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
> + (vmcs12->cr0_read_shadow & X86_CR0_TS))
> + return 1;
> + break;
> + case 1: /* mov from cr */
> + switch (cr) {
> + case 3:
> + if (vmcs12->cpu_based_vm_exec_control &
> + CPU_BASED_CR3_STORE_EXITING)
> + return 1;
> + break;
> + case 8:
> + if (vmcs12->cpu_based_vm_exec_control &
> + CPU_BASED_CR8_STORE_EXITING)
> + return 1;
> + break;
> + }
> + break;
> + case 3: /* lmsw */
> + /*
> + * lmsw can change bits 1..3 of cr0, and only set bit 0 of
> + * cr0. Other attempted changes are ignored, with no exit.
> + */
> + if (vmcs12->cr0_guest_host_mask & 0xe &
> + (val ^ vmcs12->cr0_read_shadow))
> + return 1;
> + if ((vmcs12->cr0_guest_host_mask & 0x1) &&
> + !(vmcs12->cr0_read_shadow & 0x1) &&
> + (val & 0x1))
> + return 1;
> + break;
> + }
> + return 0;
> +}
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0 (and then continue L2). Only call this
> + * when in is_guest_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
> +{
> + u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
> + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> + struct vcpu_vmx *vmx = to_vmx(vcpu);
> + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> + if (vmx->nested.nested_run_pending)
> + return 0;
> +
> + if (unlikely(vmx->fail)) {
> + printk(KERN_INFO "%s failed vm entry %x\n",
> + __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> + return 1;
> + }
> +
> + switch (exit_reason) {
> + case EXIT_REASON_EXCEPTION_NMI:
> + if (!is_exception(intr_info))
> + return 0;
> + else if (is_page_fault(intr_info))
> + return enable_ept;
> + return vmcs12->exception_bitmap &
> + (1u << (intr_info & INTR_INFO_VECTOR_MASK));
> + case EXIT_REASON_EXTERNAL_INTERRUPT:
> + return 0;
> + case EXIT_REASON_TRIPLE_FAULT:
> + return 1;
> + case EXIT_REASON_PENDING_INTERRUPT:
> + case EXIT_REASON_NMI_WINDOW:
> + /*
> +		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
> + * (aka Interrupt Window Exiting) only when L1 turned it on,
> + * so if we got a PENDING_INTERRUPT exit, this must be for L1.
> + * Same for NMI Window Exiting.
> + */
> + return 1;
> + case EXIT_REASON_TASK_SWITCH:
> + return 1;
> + case EXIT_REASON_CPUID:
> + return 1;
> + case EXIT_REASON_HLT:
> + return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
> + case EXIT_REASON_INVD:
> + return 1;
> + case EXIT_REASON_INVLPG:
> + return vmcs12->cpu_based_vm_exec_control &
> + CPU_BASED_INVLPG_EXITING;
Use nested_cpu_has() here.
> + case EXIT_REASON_RDPMC:
> + return vmcs12->cpu_based_vm_exec_control &
> + CPU_BASED_RDPMC_EXITING;
> + case EXIT_REASON_RDTSC:
> + return vmcs12->cpu_based_vm_exec_control &
> + CPU_BASED_RDTSC_EXITING;
Ditto for the RDPMC and RDTSC cases.
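i.e., untested, but matching the HLT case above:

	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDTSC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);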
> + case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
> + case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
> + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
> + case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
> + case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> + /*
> + * VMX instructions trap unconditionally. This allows L1 to
> + * emulate them for its L2 guest, i.e., allows 3-level nesting!
> + */
> + return 1;
> + case EXIT_REASON_CR_ACCESS:
> + return nested_vmx_exit_handled_cr(vcpu, vmcs12);
> + case EXIT_REASON_DR_ACCESS:
> + return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
> + case EXIT_REASON_IO_INSTRUCTION:
> + /* TODO: support IO bitmaps */
> + return 1;
> + case EXIT_REASON_MSR_READ:
> + case EXIT_REASON_MSR_WRITE:
> + return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
> + case EXIT_REASON_INVALID_STATE:
> + return 1;
> + case EXIT_REASON_MWAIT_INSTRUCTION:
> + return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
> + case EXIT_REASON_MONITOR_INSTRUCTION:
> + return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
> + case EXIT_REASON_PAUSE_INSTRUCTION:
> + return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
> + nested_cpu_has2(vmcs12,
> + SECONDARY_EXEC_PAUSE_LOOP_EXITING);
> + case EXIT_REASON_MCE_DURING_VMENTRY:
> + return 0;
> + case EXIT_REASON_TPR_BELOW_THRESHOLD:
> + return 1;
> + case EXIT_REASON_APIC_ACCESS:
> + return nested_cpu_has2(vmcs12,
> + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> + case EXIT_REASON_EPT_VIOLATION:
> + case EXIT_REASON_EPT_MISCONFIG:
> + return 0;
> + case EXIT_REASON_WBINVD:
> +		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
> + case EXIT_REASON_XSETBV:
> + return 1;
> + default:
> + return 1;
> + }
> +}
> +
> static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
> {
> *info1 = vmcs_readl(EXIT_QUALIFICATION);
> @@ -5303,6 +5545,17 @@ static int vmx_handle_exit(struct kvm_vc
> if (vmx->emulation_required && emulate_invalid_guest_state)
> return handle_invalid_guest_state(vcpu);
>
> + if (exit_reason == EXIT_REASON_VMLAUNCH ||
> + exit_reason == EXIT_REASON_VMRESUME)
> + vmx->nested.nested_run_pending = 1;
> + else
> + vmx->nested.nested_run_pending = 0;
What about a VMLAUNCH invoked from L2? In that case I think you expect L1 to
run next, not L2.
On the other hand, isn't the guest-mode check alone enough to tell whether a
nested run is pending? When L1 invokes VMLAUNCH/VMRESUME, guest mode hasn't
been set yet, so the check below fails; every other exit is then vetted by
nested_vmx_exit_handled()...
Am I missing anything here?
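Concretely, I would have expected the check below to be sufficient on its own,
without the nested_run_pending bookkeeping above or the early return in
nested_vmx_exit_handled() (untested, just to illustrate the question):

	/*
	 * When L1 itself executed VMLAUNCH/VMRESUME we are not in guest
	 * mode yet, so that exit is never reflected into L1 and L0 handles
	 * it as usual.
	 */
	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
		nested_vmx_vmexit(vcpu);
		return 1;
	}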
> +
> + if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> + nested_vmx_vmexit(vcpu);
> + return 1;
> + }
> +
> if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
> vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> vcpu->run->fail_entry.hardware_entry_failure_reason
> @@ -5325,7 +5578,8 @@ static int vmx_handle_exit(struct kvm_vc
> "(0x%x) and exit reason is 0x%x\n",
> __func__, vectoring_info, exit_reason);
>
> - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
> + !nested_cpu_has_virtual_nmis(vcpu))) {
Would L0 ever want to control vNMI for an L2 guest? If not, could we just use
is_guest_mode() here for the condition check?
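i.e., if that is the intent, something like (untested):

	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
		     !is_guest_mode(vcpu))) {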
> if (vmx_interrupt_allowed(vcpu)) {
> vmx->soft_vnmi_blocked = 0;
> } else if (vmx->vnmi_blocked_time > 1000000000LL &&
Thanks,
Kevin