Sean Christopherson <sea...@google.com> writes:

...
> It'll take more work than the below, e.g. to have VMX's construct_eptp() pull the
> level and A/D bits from kvm_mmu_page (vendor code can get at the kvm_mmu_page with
> root_to_sp()), but for the core concept/skeleton, I think this is it?
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 6e838cb6c9e1..298130445182 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3839,6 +3839,37 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
>
> +struct kvm_tlb_flush_root {
> +        struct kvm *kvm;
> +        hpa_t root;
> +};
> +
> +static void kvm_flush_tlb_root(void *__data)
> +{
> +        struct kvm_tlb_flush_root *data = __data;
> +
> +        kvm_x86_call(flush_tlb_root)(data->kvm, data->root);
> +}
> +
> +void kvm_mmu_flush_all_tlbs_root(struct kvm *kvm, struct kvm_mmu_page *root)
> +{
> +        struct kvm_tlb_flush_root data = {
> +                .kvm = kvm,
> +                .root = __pa(root->spt),
> +        };
> +
> +        /*
> +         * Flush any TLB entries for the new root, the provenance of the root
> +         * is unknown.  Even if KVM ensures there are no stale TLB entries
> +         * for a freed root, in theory another hypervisor could have left
> +         * stale entries.  Flushing on alloc also allows KVM to skip the TLB
> +         * flush when freeing a root (see kvm_tdp_mmu_put_root()), and flushing
> +         * TLBs on all CPUs allows KVM to elide TLB flushes when a vCPU is
> +         * migrated to a different pCPU.
> +         */
> +        on_each_cpu(kvm_flush_tlb_root, &data, 1);

Would it make sense to complement this with e.g. a CPU mask tracking all
the pCPUs where the VM has ever been seen running (+ a flush when a new
one is added to it)? I'm worried about the potential performance impact
for the case when a huge host is running a lot of small VMs in
'partitioning' mode (i.e. when all vCPUs are pinned). Additionally, this
may have a negative impact on RT use-cases, where each unnecessary
interruption can be seen as problematic.
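Something along those lines, maybe (a completely untested sketch;
'kvm->arch.ever_run_cpus' and the hook on vCPU load are invented here
purely for illustration, and kvm_x86_call(flush_tlb_all) is used as a
stand-in for whatever per-VM flush would be appropriate):

/*
 * Hypothetical: kvm->arch.ever_run_cpus is a per-VM cpumask of every
 * pCPU that any vCPU of this VM has ever been loaded on.
 */

/* Called on vCPU load, with 'cpu' being the current pCPU. */
static void kvm_track_pcpu(struct kvm_vcpu *vcpu, int cpu)
{
        /* Test first to avoid dirtying a shared cacheline on every load. */
        if (cpumask_test_cpu(cpu, &vcpu->kvm->arch.ever_run_cpus))
                return;

        cpumask_set_cpu(cpu, &vcpu->kvm->arch.ever_run_cpus);

        /*
         * A root may have been allocated (and flushed on the old mask)
         * before this pCPU was ever seen, so flush here, once per new
         * pCPU, before the vCPU runs on it.
         */
        kvm_x86_call(flush_tlb_all)(vcpu);
}

void kvm_mmu_flush_all_tlbs_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        struct kvm_tlb_flush_root data = {
                .kvm = kvm,
                .root = __pa(root->spt),
        };

        /* Only interrupt the pCPUs this VM has actually run on. */
        on_each_cpu_mask(&kvm->arch.ever_run_cpus, kvm_flush_tlb_root,
                         &data, 1);
}

Setting the bit in the mask would of course need to be ordered against a
concurrent root allocation reading it, so that a vCPU migrating to a new
pCPU while a root is being allocated can't miss both flushes. But for
the pinned/partitioned case the mask stabilizes quickly and the IPIs
stay confined to the VM's own pCPUs.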
> +}
> +
>  static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
>                              u8 level)
>  {
> @@ -3852,7 +3883,8 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
>          WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
>
>          sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
> -        ++sp->root_count;
> +        if (!sp->root_count++)
> +                kvm_mmu_flush_all_tlbs_root(vcpu->kvm, sp);
>
>          return __pa(sp->spt);
>  }
> @@ -5961,15 +5993,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
>          kvm_mmu_sync_roots(vcpu);
>
>          kvm_mmu_load_pgd(vcpu);
> -
> -        /*
> -         * Flush any TLB entries for the new root, the provenance of the root
> -         * is unknown.  Even if KVM ensures there are no stale TLB entries
> -         * for a freed root, in theory another hypervisor could have left
> -         * stale entries.  Flushing on alloc also allows KVM to skip the TLB
> -         * flush when freeing a root (see kvm_tdp_mmu_put_root()).
> -         */
> -        kvm_x86_call(flush_tlb_current)(vcpu);
>  out:
>          return r;
>  }
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 65f3c89d7c5d..3cbf0d612f5e 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -167,6 +167,8 @@ static inline bool is_mirror_sp(const struct kvm_mmu_page *sp)
>          return sp->role.is_mirror;
>  }
>
> +void kvm_mmu_flush_all_tlbs_root(struct kvm *kvm, struct kvm_mmu_page *root);
> +
>  static inline void kvm_mmu_alloc_external_spt(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>  {
>          /*
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 7f3d7229b2c1..3ff36d09b4fa 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -302,6 +302,7 @@ void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
>           */
>          refcount_set(&root->tdp_mmu_root_count, 2);
>          list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
> +        kvm_mmu_flush_all_tlbs_root(vcpu->kvm, root);
>
>  out_spin_unlock:
>          spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

-- 
Vitaly