On Thu, Dec 13, 2012 at 03:29:21AM +0800, Xiao Guangrong wrote:
> On 12/12/2012 09:09 AM, Marcelo Tosatti wrote:
> > On Mon, Dec 10, 2012 at 05:14:47PM +0800, Xiao Guangrong wrote:
> >> The current reexecute_instruction can not well detect the failed 
> >> instruction
> >> emulation. It allows guest to retry all the instructions except it accesses
> >> on error pfn
> >>
> >> For example, some cases are nested-write-protect - if the page we want to
> >> write is used as PDE but it chains to itself. Under this case, we should
> >> stop the emulation and report the case to userspace
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangr...@linux.vnet.ibm.com>
> >> ---
> >>  arch/x86/include/asm/kvm_host.h |    2 +
> >>  arch/x86/kvm/paging_tmpl.h      |    2 +
> >>  arch/x86/kvm/x86.c              |   82 
> >> +++++++++++++++++++++++++++++----------
> >>  3 files changed, 65 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h 
> >> b/arch/x86/include/asm/kvm_host.h
> >> index dc87b65..8d01c02 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -575,6 +575,8 @@ struct kvm_arch {
> >>    u64 hv_guest_os_id;
> >>    u64 hv_hypercall;
> >>
> >> +  /* synchronizing reexecute_instruction and page fault path. */
> >> +  u64 page_fault_count;
> >>    #ifdef CONFIG_KVM_MMU_AUDIT
> >>    int audit_point;
> >>    #endif
> >> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >> index 32d77ff..85b8e0e 100644
> >> --- a/arch/x86/kvm/paging_tmpl.h
> >> +++ b/arch/x86/kvm/paging_tmpl.h
> >> @@ -614,6 +614,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
> >> gva_t addr, u32 error_code,
> >>    if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
> >>            goto out_unlock;
> >>
> >> +  vcpu->kvm->arch.page_fault_count++;
> >> +
> >>    kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
> >>    kvm_mmu_free_some_pages(vcpu);
> >>    if (!force_pt_level)
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 3796f8c..5677869 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -4756,29 +4756,27 @@ static int handle_emulation_failure(struct 
> >> kvm_vcpu *vcpu)
> >>  static bool reexecute_instruction(struct kvm_vcpu *vcpu, unsigned long 
> >> cr2)
> >>  {
> >>    gpa_t gpa = cr2;
> >> +  gfn_t gfn;
> >>    pfn_t pfn;
> >> -  unsigned int indirect_shadow_pages;
> >> -
> >> -  spin_lock(&vcpu->kvm->mmu_lock);
> >> -  indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
> >> -  spin_unlock(&vcpu->kvm->mmu_lock);
> >> -
> >> -  if (!indirect_shadow_pages)
> >> -          return false;
> >> +  u64 page_fault_count;
> >> +  int emulate;
> >>
> >>    if (!vcpu->arch.mmu.direct_map) {
> >> -          gpa = kvm_mmu_gva_to_gpa_read(vcpu, cr2, NULL);
> >> +          /*
> >> +           * Write permission should be allowed since only
> >> +           * write access need to be emulated.
> >> +           */
> >> +          gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
> >> +
> >> +          /*
> >> +           * If the mapping is invalid in guest, let cpu retry
> >> +           * it to generate fault.
> >> +           */
> >>            if (gpa == UNMAPPED_GVA)
> >> -                  return true; /* let cpu generate fault */
> >> +                  return true;
> >>    }
> >>
> >> -  /*
> >> -   * if emulation was due to access to shadowed page table
> >> -   * and it failed try to unshadow page and re-enter the
> >> -   * guest to let CPU execute the instruction.
> >> -   */
> >> -  if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
> >> -          return true;
> >> +  gfn = gpa_to_gfn(gpa);
> >>
> >>    /*
> >>     * Do not retry the unhandleable instruction if it faults on the
> >> @@ -4786,13 +4784,55 @@ static bool reexecute_instruction(struct kvm_vcpu 
> >> *vcpu, unsigned long cr2)
> >>     * retry instruction -> write #PF -> emulation fail -> retry
> >>     * instruction -> ...
> >>     */
> >> -  pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
> >> -  if (!is_error_noslot_pfn(pfn)) {
> >> -          kvm_release_pfn_clean(pfn);
> >> +  pfn = gfn_to_pfn(vcpu->kvm, gfn);
> >> +
> >> +  /*
> >> +   * If the instruction failed on the error pfn, it can not be fixed,
> >> +   * report the error to userspace.
> >> +   */
> >> +  if (is_error_noslot_pfn(pfn))
> >> +          return false;
> >> +
> >> +  kvm_release_pfn_clean(pfn);
> >> +
> >> +  /* The instructions are well-emulated on direct mmu. */
> >> +  if (vcpu->arch.mmu.direct_map) {
> >> +          unsigned int indirect_shadow_pages;
> >> +
> >> +          spin_lock(&vcpu->kvm->mmu_lock);
> >> +          indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
> >> +          spin_unlock(&vcpu->kvm->mmu_lock);
> >> +
> >> +          if (indirect_shadow_pages)
> >> +                  kvm_mmu_unprotect_page(vcpu->kvm, gfn);
> >> +
> >>            return true;
> >>    }
> >>
> >> -  return false;
> >> +again:
> >> +  page_fault_count = ACCESS_ONCE(vcpu->kvm->arch.page_fault_count);
> >> +
> >> +  /*
> >> +   * The instruction emulation is caused by fault access on cr2.
> >> +   * After unprotect the target page, we call
> >> +   * vcpu->arch.mmu.page_fault to fix the mapping of cr2. If it
> >> +   * return 1, mmu can not fix the mapping, we should report the
> >> +   * error, otherwise it is good to return to guest and let it
> >> +   * re-execute the instruction again.
> >> +   *
> >> +   * page_fault_count is used to avoid the race on other vcpus,
> >> +   * since after we unprotect the target page, other cpu can enter
> >> +   * page fault path and let the page be write-protected again.
> >> +   */
> >> +  kvm_mmu_unprotect_page(vcpu->kvm, gfn);
> >> +  emulate = vcpu->arch.mmu.page_fault(vcpu, cr2, PFERR_WRITE_MASK, false);
> >> +
> >> +  /* The page fault path called above can increase the count. */
> >> +  if (page_fault_count + 1 !=
> >> +            ACCESS_ONCE(vcpu->kvm->arch.page_fault_count))
> >> +          goto again;
> >> +
> >> +  return !emulate;
> >>  }
> >>
> >>  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,\
> > 
> > Same comment as before: the only case where it should not attempt to 
> > emulate is when there is a condition which makes it impossible to fix
> > (the information is available to detect that condition).
> > 
> > The earlier suggestion
> > 
> >     "How about recording the gfn number for shadow pages that have been
> > shadowed in the current pagefault run?"
> > 
> > Was about that.
> 
> I think we can have a try. Is this change good to you, Marcelo?
> 
> [eric@localhost kvm]$ git diff
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 01d7c2a..e3d0001 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4359,24 +4359,34 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm 
> *kvm)
>         return nr_mmu_pages;
>  }
> 
> -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
> +void kvm_mmu_get_sp_hierarchy(struct kvm_vcpu *vcpu, u64 addr,
> +                             struct kvm_mmu_sp_hierarchy *hierarchy)
>  {
>         struct kvm_shadow_walk_iterator iterator;
>         u64 spte;
> -       int nr_sptes = 0;
> +
> +       hierarchy->max_level = hierarchy->nr_levels = 0;
> 
>         walk_shadow_page_lockless_begin(vcpu);
>         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
> -               sptes[iterator.level-1] = spte;
> -               nr_sptes++;
> +               struct kvm_mmu_page *sp =  page_header(__pa(iterator.sptep));
> +
> +               if (hierarchy->indirect_only && sp->role.direct)
> +                       break;
> +
> +               if (!hierarchy->max_level)
> +                       hierarchy->max_level = iterator.level;
> +
> +               hierarchy->shadow_gfns[iterator.level-1] = sp->gfn;
> +               hierarchy->sptes[iterator.level-1] = spte;
> +               hierarchy->nr_levels++;
> +
>                 if (!is_shadow_present_pte(spte))
>                         break;
>         }
>         walk_shadow_page_lockless_end(vcpu);
> -
> -       return nr_sptes;
>  }

Record gfns while shadowing in the vcpu struct, in a struct, along with cr2.
Then validate 
That way its guaranteed its not some other vcpu.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to