> From: Nadav Har'El
> Sent: Tuesday, May 17, 2011 3:55 AM
> 
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
> 
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0,
> we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
> bit which changed; if it did, we exit to L1. But if it didn't, it means that
> we (L0) are the ones who wished to trap this event, so we handle it ourselves.
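
For reference, the CR0 rule described here is the mask-and-compare that
nested_vmx_exit_handled_cr() below performs; a minimal sketch (the helper name
is illustrative, not taken from the patch):

	/* Reflect the exit to L1 iff a CR0 bit that L1 owns would change
	 * relative to the value L1 placed in the read shadow.
	 */
	static bool l1_owns_changed_cr0_bit(struct vmcs12 *vmcs12,
					    unsigned long new_cr0)
	{
		return vmcs12->cr0_guest_host_mask &
			(new_cr0 ^ vmcs12->cr0_read_shadow);
	}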
> 
> The next two patches add the logic for deciding what to do when an interrupt or
> exception needs to be injected: does L0 need to do it, should we exit to L1 to
> do it, or should we resume L2 and keep the exception pending for later injection.
> 
> We keep a new flag, "nested_run_pending", which can override the decision of
> which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
> L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH of L2
> and therefore expects L2 to be run (and perhaps be injected with an event it
> specified, etc.). nested_run_pending is especially intended to avoid switching
> to L1 at the injection decision point described above.
> 
> Signed-off-by: Nadav Har'El <n...@il.ibm.com>
> ---
>  arch/x86/kvm/vmx.c |  256 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 255 insertions(+), 1 deletion(-)
> 
> --- .before/arch/x86/kvm/vmx.c        2011-05-16 22:36:49.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c 2011-05-16 22:36:49.000000000 +0300
> @@ -351,6 +351,8 @@ struct nested_vmx {
>       /* Saving the VMCS that we used for running L1 */
>       struct saved_vmcs saved_vmcs01;
>       u64 vmcs01_tsc_offset;
> +     /* L2 must run next, and mustn't decide to exit to L1. */
> +     bool nested_run_pending;
>       /*
>        * Guest pages referred to in vmcs02 with host-physical pointers, so
>        * we must keep them pinned while L2 runs.
> @@ -870,6 +872,20 @@ static inline bool nested_cpu_has2(struc
>               (vmcs12->secondary_vm_exec_control & bit);
>  }
> 
> +static inline bool nested_cpu_has_virtual_nmis(struct kvm_vcpu *vcpu)
> +{
> +     return is_guest_mode(vcpu) &&
> +             (get_vmcs12(vcpu)->pin_based_vm_exec_control &
> +                     PIN_BASED_VIRTUAL_NMIS);
> +}

Any reason to add a guest-mode check here? I didn't see such a check in your
earlier nested_cpu_has_xxx helpers. It would be clearer to use the existing
nested_cpu_has_xxx together with an explicit is_guest_mode() check, which keeps
such usage consistent.

> +
> +static inline bool is_exception(u32 intr_info)
> +{
> +     return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
> +             == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
> +}
> +
> +static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
>  static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
>                       struct vmcs12 *vmcs12,
>                       u32 reason, unsigned long qualification);
> @@ -5281,6 +5297,232 @@ static int (*kvm_vmx_exit_handlers[])(st
>  static const int kvm_vmx_max_exit_handlers =
>       ARRAY_SIZE(kvm_vmx_exit_handlers);
> 
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an MSR access,
> + * rather than handle it ourselves in L0. I.e., check whether L1 expressed
> + * disinterest in the current event (read or write a specific MSR) by using an
> + * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
> + */
> +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
> +     struct vmcs12 *vmcs12, u32 exit_reason)
> +{
> +     u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +     gpa_t bitmap;
> +
> +     if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
> +             return 1;
> +
> +     /*
> +      * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
> +      * for the four combinations of read/write and low/high MSR numbers.
> +      * First we need to figure out which of the four to use:
> +      */
> +     bitmap = vmcs12->msr_bitmap;
> +     if (exit_reason == EXIT_REASON_MSR_WRITE)
> +             bitmap += 2048;
> +     if (msr_index >= 0xc0000000) {
> +             msr_index -= 0xc0000000;
> +             bitmap += 1024;
> +     }
> +
> +     /* Then read the msr_index'th bit from this bitmap: */
> +     if (msr_index < 1024*8) {
> +             unsigned char b;
> +             kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
> +             return 1 & (b >> (msr_index & 7));
> +     } else
> +             return 1; /* let L1 handle the wrong parameter */
> +}
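
As a concrete worked example of the lookup above (my illustration, not part of
the patch): an L2 write to MSR 0xc0000080 (IA32_EFER) selects the write bitmaps
(+2048) and the high-MSR range (+1024, the index becoming 0x80), so the bit
consulted is byte 3088, bit 0 of L1's MSR-bitmap page; if that bit is set, the
exit is reflected to L1, otherwise L0 handles the WRMSR itself.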
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> +     struct vmcs12 *vmcs12)
> +{
> +     unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +     int cr = exit_qualification & 15;
> +     int reg = (exit_qualification >> 8) & 15;
> +     unsigned long val = kvm_register_read(vcpu, reg);
> +
> +     switch ((exit_qualification >> 4) & 3) {
> +     case 0: /* mov to cr */
> +             switch (cr) {
> +             case 0:
> +                     if (vmcs12->cr0_guest_host_mask &
> +                         (val ^ vmcs12->cr0_read_shadow))
> +                             return 1;
> +                     break;
> +             case 3:
> +                     if ((vmcs12->cr3_target_count >= 1 &&
> +                                     vmcs12->cr3_target_value0 == val) ||
> +                             (vmcs12->cr3_target_count >= 2 &&
> +                                     vmcs12->cr3_target_value1 == val) ||
> +                             (vmcs12->cr3_target_count >= 3 &&
> +                                     vmcs12->cr3_target_value2 == val) ||
> +                             (vmcs12->cr3_target_count >= 4 &&
> +                                     vmcs12->cr3_target_value3 == val))
> +                             return 0;
> +                     if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
> +                             return 1;
> +                     break;
> +             case 4:
> +                     if (vmcs12->cr4_guest_host_mask &
> +                         (vmcs12->cr4_read_shadow ^ val))
> +                             return 1;
> +                     break;
> +             case 8:
> +                     if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
> +                             return 1;
> +                     break;
> +             }
> +             break;
> +     case 2: /* clts */
> +             if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
> +                 (vmcs12->cr0_read_shadow & X86_CR0_TS))
> +                     return 1;
> +             break;
> +     case 1: /* mov from cr */
> +             switch (cr) {
> +             case 3:
> +                     if (vmcs12->cpu_based_vm_exec_control &
> +                         CPU_BASED_CR3_STORE_EXITING)
> +                             return 1;
> +                     break;
> +             case 8:
> +                     if (vmcs12->cpu_based_vm_exec_control &
> +                         CPU_BASED_CR8_STORE_EXITING)
> +                             return 1;
> +                     break;
> +             }
> +             break;
> +     case 3: /* lmsw */
> +             /*
> +              * lmsw can change bits 1..3 of cr0, and only set bit 0 of
> +              * cr0. Other attempted changes are ignored, with no exit.
> +              */
> +             if (vmcs12->cr0_guest_host_mask & 0xe &
> +                 (val ^ vmcs12->cr0_read_shadow))
> +                     return 1;
> +             if ((vmcs12->cr0_guest_host_mask & 0x1) &&
> +                 !(vmcs12->cr0_read_shadow & 0x1) &&
> +                 (val & 0x1))
> +                     return 1;
> +             break;
> +     }
> +     return 0;
> +}
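
As an illustration of the CLTS and lmsw cases above (my examples, not from the
patch): if L1 owns CR0.TS via cr0_guest_host_mask and L1's cr0_read_shadow has
TS set, an L2 CLTS would clear a bit L1 is shadowing, so the exit is reflected
to L1; otherwise L0 handles the CLTS itself and resumes L2. Similarly for lmsw,
only a change to one of CR0 bits 1..3 that L1 owns, or an attempt to set PE
while L1 owns it and L1's shadow shows it clear, causes a reflected exit.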
> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0 (and then continue L2). Only call this
> + * when in is_guest_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
> +{
> +     u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
> +     u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +     struct vcpu_vmx *vmx = to_vmx(vcpu);
> +     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +     if (vmx->nested.nested_run_pending)
> +             return 0;
> +
> +     if (unlikely(vmx->fail)) {
> +             printk(KERN_INFO "%s failed vm entry %x\n",
> +                    __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +             return 1;
> +     }
> +
> +     switch (exit_reason) {
> +     case EXIT_REASON_EXCEPTION_NMI:
> +             if (!is_exception(intr_info))
> +                     return 0;
> +             else if (is_page_fault(intr_info))
> +                     return enable_ept;
> +             return vmcs12->exception_bitmap &
> +                             (1u << (intr_info & INTR_INFO_VECTOR_MASK));
> +     case EXIT_REASON_EXTERNAL_INTERRUPT:
> +             return 0;
> +     case EXIT_REASON_TRIPLE_FAULT:
> +             return 1;
> +     case EXIT_REASON_PENDING_INTERRUPT:
> +     case EXIT_REASON_NMI_WINDOW:
> +             /*
> +              * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
> +              * (aka Interrupt Window Exiting) only when L1 turned it on,
> +              * so if we got a PENDING_INTERRUPT exit, this must be for L1.
> +              * Same for NMI Window Exiting.
> +              */
> +             return 1;
> +     case EXIT_REASON_TASK_SWITCH:
> +             return 1;
> +     case EXIT_REASON_CPUID:
> +             return 1;
> +     case EXIT_REASON_HLT:
> +             return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
> +     case EXIT_REASON_INVD:
> +             return 1;
> +     case EXIT_REASON_INVLPG:
> +             return vmcs12->cpu_based_vm_exec_control &
> +                             CPU_BASED_INVLPG_EXITING;

use nested_cpu_has.

> +     case EXIT_REASON_RDPMC:
> +             return vmcs12->cpu_based_vm_exec_control &
> +                             CPU_BASED_RDPMC_EXITING;
> +     case EXIT_REASON_RDTSC:
> +             return vmcs12->cpu_based_vm_exec_control &
> +                             CPU_BASED_RDTSC_EXITING;

ditto

> +     case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
> +     case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
> +     case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
> +     case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
> +     case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
> +             /*
> +              * VMX instructions trap unconditionally. This allows L1 to
> +              * emulate them for its L2 guest, i.e., allows 3-level nesting!
> +              */
> +             return 1;
> +     case EXIT_REASON_CR_ACCESS:
> +             return nested_vmx_exit_handled_cr(vcpu, vmcs12);
> +     case EXIT_REASON_DR_ACCESS:
> +             return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
> +     case EXIT_REASON_IO_INSTRUCTION:
> +             /* TODO: support IO bitmaps */
> +             return 1;
> +     case EXIT_REASON_MSR_READ:
> +     case EXIT_REASON_MSR_WRITE:
> +             return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
> +     case EXIT_REASON_INVALID_STATE:
> +             return 1;
> +     case EXIT_REASON_MWAIT_INSTRUCTION:
> +             return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
> +     case EXIT_REASON_MONITOR_INSTRUCTION:
> +             return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
> +     case EXIT_REASON_PAUSE_INSTRUCTION:
> +             return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
> +                     nested_cpu_has2(vmcs12,
> +                             SECONDARY_EXEC_PAUSE_LOOP_EXITING);
> +     case EXIT_REASON_MCE_DURING_VMENTRY:
> +             return 0;
> +     case EXIT_REASON_TPR_BELOW_THRESHOLD:
> +             return 1;
> +     case EXIT_REASON_APIC_ACCESS:
> +             return nested_cpu_has2(vmcs12,
> +                     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
> +     case EXIT_REASON_EPT_VIOLATION:
> +     case EXIT_REASON_EPT_MISCONFIG:
> +             return 0;
> +     case EXIT_REASON_WBINVD:
> +             return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
> +     case EXIT_REASON_XSETBV:
> +             return 1;
> +     default:
> +             return 1;
> +     }
> +}
> +
>  static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
>  {
>       *info1 = vmcs_readl(EXIT_QUALIFICATION);
> @@ -5303,6 +5545,17 @@ static int vmx_handle_exit(struct kvm_vc
>       if (vmx->emulation_required && emulate_invalid_guest_state)
>               return handle_invalid_guest_state(vcpu);
> 
> +     if (exit_reason == EXIT_REASON_VMLAUNCH ||
> +         exit_reason == EXIT_REASON_VMRESUME)
> +             vmx->nested.nested_run_pending = 1;
> +     else
> +             vmx->nested.nested_run_pending = 0;

What about a VMLAUNCH invoked from L2? In that case I think you expect
L1 to run instead of L2.

On the other hand, isn't a guest-mode check alone enough to distinguish a
pending nested run? When L1 invokes VMLAUNCH/VMRESUME, guest mode
hasn't been set yet, so the check below will fail. All other operations will
then be checked by nested_vmx_exit_handled()...

Am I missing anything here?

> +
> +     if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
> +             nested_vmx_vmexit(vcpu);
> +             return 1;
> +     }
> +
>       if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
>               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
>               vcpu->run->fail_entry.hardware_entry_failure_reason
> @@ -5325,7 +5578,8 @@ static int vmx_handle_exit(struct kvm_vc
>                      "(0x%x) and exit reason is 0x%x\n",
>                      __func__, vectoring_info, exit_reason);
> 
> -     if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
> +     if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
> +                     !nested_cpu_has_virtual_nmis(vcpu))) {

Would L0 ever want to control vNMI for an L2 guest? If not, could we just use
is_guest_mode() here for the condition check?

>               if (vmx_interrupt_allowed(vcpu)) {
>                       vmx->soft_vnmi_blocked = 0;
>               } else if (vmx->vnmi_blocked_time > 1000000000LL &&


Thanks,
Kevin