Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 18:23, Paolo Bonzini wrote: > > On 20/04/21 10:48, Wanpeng Li wrote: > >> I was thinking of something simpler: > >> > >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > >> index 9b8e30dd5b9b..455c648f9adc 100644 > >> --- a/virt/kvm/kvm_main.c > >> +++ b/virt/kvm/kvm_main.c > >> @@ -3198,10 +3198,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >>{ > >> struct kvm *kvm = me->kvm; > >> struct kvm_vcpu *vcpu; > >> - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; > >> int yielded = 0; > >> int try = 3; > >> - int pass; > >> + int pass, num_passes = 1; > >> int i; > >> > >> kvm_vcpu_set_in_spin_loop(me, true); > >> @@ -3212,13 +3211,14 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >> * VCPU is holding the lock that we need and will release it. > >> * We approximate round-robin by starting at the last boosted > >> VCPU. > >> */ > >> - for (pass = 0; pass < 2 && !yielded && try; pass++) { > >> - kvm_for_each_vcpu(i, vcpu, kvm) { > >> - if (!pass && i <= last_boosted_vcpu) { > >> - i = last_boosted_vcpu; > >> - continue; > >> - } else if (pass && i > last_boosted_vcpu) > >> - break; > >> + for (pass = 0; pass < num_passes; pass++) { > >> + int idx = me->kvm->last_boosted_vcpu; > >> + int n = atomic_read(>online_vcpus); > >> + for (i = 0; i < n; i++, idx++) { > >> + if (idx == n) > >> + idx = 0; > >> + > >> + vcpu = kvm_get_vcpu(kvm, idx); > >> if (!READ_ONCE(vcpu->ready)) > >> continue; > >> if (vcpu == me) > >> @@ -3226,23 +3226,36 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >> if (rcuwait_active(>wait) && > >> !vcpu_dy_runnable(vcpu)) > >> continue; > >> - if (READ_ONCE(vcpu->preempted) && > >> yield_to_kernel_mode && > >> - !kvm_arch_vcpu_in_kernel(vcpu)) > >> - continue; > >> if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) > >> continue; > >> > >> + if (READ_ONCE(vcpu->preempted) && > >> yield_to_kernel_mode && > >> + 
!kvm_arch_vcpu_in_kernel(vcpu)) { > >> + /* > >> +* A vCPU running in userspace can get to > >> kernel mode via > >> +* an interrupt. That's a worse choice than a > >> CPU already > >> +* in kernel mode so only do it on a second > >> pass. > >> +*/ > >> + if (!vcpu_dy_runnable(vcpu)) > >> + continue; > >> + if (pass == 0) { > >> + num_passes = 2; > >> + continue; > >> + } > >> + } > >> + > >> yielded = kvm_vcpu_yield_to(vcpu); > >> if (yielded > 0) { > >> kvm->last_boosted_vcpu = i; > >> - break; > >> + goto done; > >> } else if (yielded < 0) { > >> try--; > >> if (!try) > >> - break; > >> + goto done; > >> } > >> } > >> } > >> +done: > > > > We just tested the above post against 96 vCPUs VM in an over-subscribe > > scenario, the score of pbzip2 fluctuated drastically. Sometimes it is > > worse than vanilla, but the average improvement is around 2.2%. The > > new version of my post is around 9.3%,the origial posted patch is > > around 10% which is totally as expected since now both IPI receivers > > in user-mode and lock-waiters are second class citizens. > > Fair enough. Of the two patches you posted I prefer the original, so > I'll go with that one. Great! Thanks. :) Wanpeng
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 15:23, Paolo Bonzini wrote: > > On 20/04/21 08:08, Wanpeng Li wrote: > > On Tue, 20 Apr 2021 at 14:02, Wanpeng Li wrote: > >> > >> On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > >>> > >>> On 19/04/21 18:32, Sean Christopherson wrote: > >>>> If false positives are a big concern, what about adding another pass to > >>>> the loop > >>>> and only yielding to usermode vCPUs with interrupts in the second full > >>>> pass? > >>>> I.e. give vCPUs that are already in kernel mode priority, and only yield > >>>> to > >>>> handle an interrupt if there are no vCPUs in kernel mode. > >>>> > >>>> kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good > >>>> thing. > >>> > >>> pv_unhalted won't help if you're waiting for a kernel spinlock though, > >>> would it? Doing two passes (or looking for a "best" candidate that > >>> prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > >>> seems like the best choice overall. > >> > >> How about something like this: > > I was thinking of something simpler: > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 9b8e30dd5b9b..455c648f9adc 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3198,10 +3198,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > { > struct kvm *kvm = me->kvm; > struct kvm_vcpu *vcpu; > - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; > int yielded = 0; > int try = 3; > - int pass; > + int pass, num_passes = 1; > int i; > > kvm_vcpu_set_in_spin_loop(me, true); > @@ -3212,13 +3211,14 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > * VCPU is holding the lock that we need and will release it. > * We approximate round-robin by starting at the last boosted VCPU. 
> */ > - for (pass = 0; pass < 2 && !yielded && try; pass++) { > - kvm_for_each_vcpu(i, vcpu, kvm) { > - if (!pass && i <= last_boosted_vcpu) { > - i = last_boosted_vcpu; > - continue; > - } else if (pass && i > last_boosted_vcpu) > - break; > + for (pass = 0; pass < num_passes; pass++) { > + int idx = me->kvm->last_boosted_vcpu; > + int n = atomic_read(>online_vcpus); > + for (i = 0; i < n; i++, idx++) { > + if (idx == n) > + idx = 0; > + > + vcpu = kvm_get_vcpu(kvm, idx); > if (!READ_ONCE(vcpu->ready)) > continue; > if (vcpu == me) > @@ -3226,23 +3226,36 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > if (rcuwait_active(>wait) && > !vcpu_dy_runnable(vcpu)) > continue; > - if (READ_ONCE(vcpu->preempted) && > yield_to_kernel_mode && > - !kvm_arch_vcpu_in_kernel(vcpu)) > - continue; > if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) > continue; > > + if (READ_ONCE(vcpu->preempted) && > yield_to_kernel_mode && > + !kvm_arch_vcpu_in_kernel(vcpu)) { > + /* > +* A vCPU running in userspace can get to kernel > mode via > +* an interrupt. That's a worse choice than a > CPU already > +* in kernel mode so only do it on a second pass. > +*/ > + if (!vcpu_dy_runnable(vcpu)) > + continue; > + if (pass == 0) { > + num_passes = 2; > + continue; > + } > + } > + > yielded = kvm_vcpu_yield_to(vcpu); > if (yielded > 0) { > kvm->last_boosted_vcpu = i; > -
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 14:02, Wanpeng Li wrote: > > On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > > > > On 19/04/21 18:32, Sean Christopherson wrote: > > > If false positives are a big concern, what about adding another pass to > > > the loop > > > and only yielding to usermode vCPUs with interrupts in the second full > > > pass? > > > I.e. give vCPUs that are already in kernel mode priority, and only yield > > > to > > > handle an interrupt if there are no vCPUs in kernel mode. > > > > > > kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good > > > thing. > > > > pv_unhalted won't help if you're waiting for a kernel spinlock though, > > would it? Doing two passes (or looking for a "best" candidate that > > prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > > seems like the best choice overall. > > How about something like this: Sorry, should be the codes below: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b4dd95..9bc5f87 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -325,10 +325,12 @@ struct kvm_vcpu { * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. + * pending_interrupt: set when a vcpu waiting for an interrupt * dy_eligible: indicates whether vcpu is eligible for directed yield. 
*/ struct { bool in_spin_loop; +bool pending_interrupt; bool dy_eligible; } spin_loop; #endif @@ -1427,6 +1429,12 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } + +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +vcpu->spin_loop.pending_interrupt = val; +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; @@ -1438,6 +1446,10 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 529cff1..bf6f1ec 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -410,6 +410,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(>blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); +kvm_vcpu_set_pending_interrupt(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; @@ -3079,14 +3080,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * - * (a) VCPU which has not done pl-exit or cpu relax intercepted recently - * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beginning and cleared at the end of interception/PLE handler. + * (a) VCPU which has not done pl-exit or cpu relax intercepted and is not + * waiting for an interrupt recently (preempted lock holder). The former + * one is indicated by @in_spin_loop, set at the beginning and cleared at + * the end of interception/PLE handler. 
The later one is indicated by + * @pending_interrupt, set when interrupt is delivering and cleared at + * the end of directed yield. * - * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get - * chance last time (mostly it has become eligible now since we have probably - * yielded to lockholder in last iteration. This is done by toggling - * @dy_eligible each time a VCPU checked for eligibility.) + * (b) VCPU which has done pl-exit/ cpu relax intercepted or is waiting for + * interrupt but did not get chance last time (mostly it has become eligible + * now since we have probably yielded to lockholder in last iteration. This + * is done by toggling @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU @@ -3102,10 +3106,10 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; -eligible = !vcpu->spin_loop.in_spin_loop || +eligible = !(vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.pending_interrupt) || vcpu->spin_loop.dy_eligible; -if (vcpu->spin_loop.in_spin_loop) +if (vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.pending_interrupt) kv
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > > On 19/04/21 18:32, Sean Christopherson wrote: > > If false positives are a big concern, what about adding another pass to the > > loop > > and only yielding to usermode vCPUs with interrupts in the second full pass? > > I.e. give vCPUs that are already in kernel mode priority, and only yield to > > handle an interrupt if there are no vCPUs in kernel mode. > > > > kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good thing. > > pv_unhalted won't help if you're waiting for a kernel spinlock though, > would it? Doing two passes (or looking for a "best" candidate that > prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > seems like the best choice overall. How about something like this: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b4dd95..8ba50be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -325,10 +325,12 @@ struct kvm_vcpu { * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. + * pending_interrupt: set when a vcpu waiting for an interrupt * dy_eligible: indicates whether vcpu is eligible for directed yield. 
*/ struct { bool in_spin_loop; +bool pending_interrupt; bool dy_eligible; } spin_loop; #endif @@ -1427,6 +1429,12 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } + +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +vcpu->spin_loop.pending__interrupt = val; +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; @@ -1438,6 +1446,10 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 529cff1..42e0255 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -410,6 +410,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(>blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); +kvm_vcpu_set_pending_interrupt(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; @@ -3079,14 +3080,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * - * (a) VCPU which has not done pl-exit or cpu relax intercepted recently - * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beginning and cleared at the end of interception/PLE handler. + * (a) VCPU which has not done pl-exit or cpu relax intercepted and is not + * waiting for an interrupt recently (preempted lock holder). The former + * one is indicated by @in_spin_loop, set at the beginning and cleared at + * the end of interception/PLE handler. 
The later one is indicated by + * @pending_interrupt, set when interrupt is delivering and cleared at + * the end of directed yield. * - * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get - * chance last time (mostly it has become eligible now since we have probably - * yielded to lockholder in last iteration. This is done by toggling - * @dy_eligible each time a VCPU checked for eligibility.) + * (b) VCPU which has done pl-exit/ cpu relax intercepted or is waiting for + * interrupt but did not get chance last time (mostly it has become eligible + * now since we have probably yielded to lockholder in last iteration. This + * is done by toggling @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU @@ -3102,10 +3106,10 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; -eligible = !vcpu->spin_loop.in_spin_loop || +eligible = !(vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.has_interrupt) || vcpu->spin_loop.dy_eligible; -if (vcpu->spin_loop.in_spin_loop) +if (vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.has_interrupt) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; @@ -3137,6 +3141,16 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +static bool kvm_has_interrupt_delivery(struct kvm_vcpu *vcpu) +{ +if (vcpu_dy_runnable(vcpu)) { +kvm_vcpu_set_pending_interrupt(vcpu, true); +return true;
Re: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
On Fri, 2 Apr 2021 at 08:59, Sean Christopherson wrote: > > Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}() > notifications. Because mmu_notifier_count must be modified while holding > mmu_lock for write, and must always be paired across start->end to stay > balanced, lock elision must happen in both or none. To meet that > requirement, add a rwsem to prevent memslot updates across range_start() > and range_end(). > > Use a rwsem instead of a rwlock since most notifiers _allow_ blocking, > and the lock will be endl across the entire start() ... end() sequence. > If anything in the sequence sleeps, including the caller or a different > notifier, holding the spinlock would be disastrous. > > For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down > the slow path of unconditionally acquiring mmu_lock. The sane > alternative would be to try to acquire the lock and force the notifier > to retry on failure. But since OOM is currently the _only_ scenario > where blocking is disallowed attempting to optimize a guest that has been > marked for death is pointless. > > Unconditionally define and use mmu_notifier_slots_lock in the memslots > code, purely to avoid more #ifdefs. The overhead of acquiring the lock > is negligible when the lock is uncontested, which will always be the case > when the MMU notifiers are not used. > > Note, technically flag-only memslot updates could be allowed in parallel, > but stalling a memslot update for a relatively short amount of time is > not a scalability issue, and this is all more than complex enough. > > Based heavily on code from Ben Gardon. 
> > Suggested-by: Ben Gardon > Signed-off-by: Sean Christopherson I saw this splatting: == WARNING: possible circular locking dependency detected 5.12.0-rc3+ #6 Tainted: G OE -- qemu-system-x86/3069 is trying to acquire lock: 9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190 but task is already holding lock: aff7410a9160 (>mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (>mmu_notifier_slots_lock){.+.+}-{3:3}: down_read+0x48/0x250 kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] __mmu_notifier_invalidate_range_start+0xe8/0x260 wp_page_copy+0x82b/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 exc_page_fault+0x82/0x2b0 asm_exc_page_fault+0x1e/0x30 -> #0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}: __lock_acquire+0x110f/0x1980 lock_acquire+0x1bc/0x400 __mmu_notifier_invalidate_range_end+0x47/0x190 wp_page_copy+0x796/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 exc_page_fault+0x82/0x2b0 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0CPU1 lock(>mmu_notifier_slots_lock); lock(mmu_notifier_invalidate_range_start); lock(>mmu_notifier_slots_lock); lock(mmu_notifier_invalidate_range_start); *** DEADLOCK *** 2 locks held by qemu-system-x86/3069: #0: 9e4269f8a9e0 (>mmap_lock#2){}-{3:3}, at: do_user_addr_fault+0x10e/0x690 #1: aff7410a9160 (>mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] stack backtrace: CPU: 0 PID: 3069 Comm: qemu-system-x86 Tainted: G OE 5.12.0-rc3+ #6 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0x87/0xb7 
print_circular_bug.isra.39+0x1b4/0x210 check_noncircular+0x103/0x150 __lock_acquire+0x110f/0x1980 ? __lock_acquire+0x110f/0x1980 lock_acquire+0x1bc/0x400 ? __mmu_notifier_invalidate_range_end+0x5/0x190 ? find_held_lock+0x40/0xb0 __mmu_notifier_invalidate_range_end+0x47/0x190 ? __mmu_notifier_invalidate_range_end+0x5/0x190 wp_page_copy+0x796/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 ? rcu_read_lock_sched_held+0x4f/0x80 exc_page_fault+0x82/0x2b0 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x55f5bef2560f
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Sat, 17 Apr 2021 at 21:09, Paolo Bonzini wrote: > > On 16/04/21 05:08, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Both lock holder vCPU and IPI receiver that has halted are condidate for > > boost. However, the PLE handler was originally designed to deal with the > > lock holder preemption problem. The Intel PLE occurs when the spinlock > > waiter is in kernel mode. This assumption doesn't hold for IPI receiver, > > they can be in either kernel or user mode. the vCPU candidate in user mode > > will not be boosted even if they should respond to IPIs. Some benchmarks > > like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most > > of the time they are running in user mode. It can lead to a large number > > of continuous PLE events because the IPI sender causes PLE events > > repeatedly until the receiver is scheduled while the receiver is not > > candidate for a boost. > > > > This patch boosts the vCPU candidiate in user mode which is delivery > > interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs > > VM in over-subscribe scenario (The host machine is 2 socket, 48 cores, > > 96 HTs Intel CLX box). There is no performance regression for other > > benchmarks like Unixbench spawn (most of the time contend read/write > > lock in kernel mode), ebizzy (most of the time contend read/write sem > > and TLB shoodtdown in kernel mode). > > > > +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) > > +{ > > + if (vcpu->arch.apicv_active && > > static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) > > + return true; > > + > > + return false; > > +} > > Can you reuse vcpu_dy_runnable instead of this new function? I have some concerns. For x86 arch, vcpu_dy_runnable() will add extra vCPU candidates by KVM_REQ_EVENT and async pf(which has already opportunistically made the guest do other stuff). 
For other arches, kvm_arch_dy_runnale() is equal to kvm_arch_vcpu_runnable() except powerpc which has too many events and is not conservative. In general, vcpu_dy_runnable() will loose the conditions and add more vCPU candidates. Wanpeng
[PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
From: Wanpeng Li Both lock holder vCPU and IPI receiver that has halted are condidate for boost. However, the PLE handler was originally designed to deal with the lock holder preemption problem. The Intel PLE occurs when the spinlock waiter is in kernel mode. This assumption doesn't hold for IPI receiver, they can be in either kernel or user mode. the vCPU candidate in user mode will not be boosted even if they should respond to IPIs. Some benchmarks like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most of the time they are running in user mode. It can lead to a large number of continuous PLE events because the IPI sender causes PLE events repeatedly until the receiver is scheduled while the receiver is not candidate for a boost. This patch boosts the vCPU candidiate in user mode which is delivery interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs VM in over-subscribe scenario (The host machine is 2 socket, 48 cores, 96 HTs Intel CLX box). There is no performance regression for other benchmarks like Unixbench spawn (most of the time contend read/write lock in kernel mode), ebizzy (most of the time contend read/write sem and TLB shoodtdown in kernel mode). 
Signed-off-by: Wanpeng Li --- arch/x86/kvm/x86.c | 8 include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 6 ++ 3 files changed, 15 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d2dd3f..0f16fa5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11069,6 +11069,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) return false; } +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b06d12..5012fc4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -954,6 +954,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a481e7..781d2db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3012,6 +3012,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +bool __weak kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -3045,6 +3050,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && + !kvm_arch_interrupt_delivery(vcpu) && !kvm_arch_vcpu_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) -- 2.7.4
Re: [PATCH v2 0/3] KVM: Properly account for guest CPU time
On Thu, 15 Apr 2021 at 08:49, Sean Christopherson wrote: > > On Wed, Apr 14, 2021, Wanpeng Li wrote: > > On Wed, 14 Apr 2021 at 01:25, Sean Christopherson wrote: > > > > > > On Tue, Apr 13, 2021, Wanpeng Li wrote: > > > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > > > reported that the guest time remains 0 when running a while true > > > > loop in the guest. > > > > > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > > > tick-based time accouting when the ticks that happen after IRQs are > > > > disabled are incorrectly accounted to the host/system time. This is > > > > because we exit the guest state too early. > > > > > > > > This patchset splits both context tracking logic and the time accounting > > > > logic from guest_enter/exit_irqoff(), keep context tracking around the > > > > actual vmentry/exit code, have the virt time specific helpers which > > > > can be placed at the proper spots in kvm. In addition, it will not > > > > break the world outside of x86. > > > > > > IMO, this is going in the wrong direction. Rather than separate context > > > tracking, > > > vtime accounting, and KVM logic, this further intertwines the three. > > > E.g. the > > > context tracking code has even more vtime accounting NATIVE vs. GEN vs. > > > TICK > > > logic baked into it. > > > > > > Rather than smush everything into context_tracking.h, I think we can > > > cleanly > > > split the context tracking and vtime accounting code into separate > > > pieces, which > > > will in turn allow moving the wrapping logic to linux/kvm_host.h. Once > > > that is > > > done, splitting the context tracking and time accounting logic for KVM x86 > > > becomes a KVM detail as opposed to requiring dedicated logic in the > > > context > > > tracking code. > > > > > > I have untested code that compiles on x86, I'll send an RFC shortly. 
> > > > We need an easy to backport fix and then we might have some further > > cleanups on top. > > I fiddled with this a bit today, I think I have something workable that will > be > a relatively clean and short backport. With luck, I'll get it posted > tomorrow. I think we should improve my posted version instead of posting a lot of alternative versions to save everybody's time. Wanpeng
Re: [PATCH v2 0/3] KVM: Properly account for guest CPU time
On Wed, 14 Apr 2021 at 01:25, Sean Christopherson wrote: > > On Tue, Apr 13, 2021, Wanpeng Li wrote: > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > reported that the guest time remains 0 when running a while true > > loop in the guest. > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > tick-based time accouting when the ticks that happen after IRQs are > > disabled are incorrectly accounted to the host/system time. This is > > because we exit the guest state too early. > > > > This patchset splits both context tracking logic and the time accounting > > logic from guest_enter/exit_irqoff(), keep context tracking around the > > actual vmentry/exit code, have the virt time specific helpers which > > can be placed at the proper spots in kvm. In addition, it will not > > break the world outside of x86. > > IMO, this is going in the wrong direction. Rather than separate context > tracking, > vtime accounting, and KVM logic, this further intertwines the three. E.g. the > context tracking code has even more vtime accounting NATIVE vs. GEN vs. TICK > logic baked into it. > > Rather than smush everything into context_tracking.h, I think we can cleanly > split the context tracking and vtime accounting code into separate pieces, > which > will in turn allow moving the wrapping logic to linux/kvm_host.h. Once that > is > done, splitting the context tracking and time accounting logic for KVM x86 > becomes a KVM detail as opposed to requiring dedicated logic in the context > tracking code. > > I have untested code that compiles on x86, I'll send an RFC shortly. We need an easy to backport fix and then we might have some further cleanups on top. Wanpeng
Re: [PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
On Tue, 13 Apr 2021 at 15:48, Christian Borntraeger wrote: > > > > On 13.04.21 09:38, Wanpeng Li wrote: > > On Tue, 13 Apr 2021 at 15:35, Christian Borntraeger > > wrote: > >> > >> > >> > >> On 13.04.21 09:16, Wanpeng Li wrote: > >> [...] > >> > >>> @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) > >>>} > >>> > >>>#else > THis is the else part > > > >>> +static __always_inline void context_guest_enter_irqoff(void) > >>> +{ > >>> + instrumentation_begin(); > > 2nd on > >>> + rcu_virt_note_context_switch(smp_processor_id()); > >>> + instrumentation_end(); > 2nd off > >>> +} > >>> + > >>>static __always_inline void guest_enter_irqoff(void) > >>>{ > >>>/* > >>> @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) > >>>instrumentation_begin(); > > first on > >>>vtime_account_kernel(current); > >>>current->flags |= PF_VCPU; > >>> - rcu_virt_note_context_switch(smp_processor_id()); > >>>instrumentation_end(); > > first off > >>> + > >>> + context_guest_enter_irqoff(); > here we call the 2nd on and off. > >> > >> So we now do instrumentation_begin 2 times? > > > > Similar to context_guest_enter_irqoff() ifdef > > CONFIG_VIRT_CPU_ACCOUNTING_GEN. > > For the > ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN part > context_guest_enter_irqoff() > does not have instrumentation_begin/end. > > Or did I miss anything. I mean the if (!context_tracking_enabled_this_cpu()) part in the function context_guest_enter_irqoff() ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN. :) Wanpeng
Re: [PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
On Tue, 13 Apr 2021 at 15:35, Christian Borntraeger wrote: > > > > On 13.04.21 09:16, Wanpeng Li wrote: > [...] > > > @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) > > } > > > > #else > > +static __always_inline void context_guest_enter_irqoff(void) > > +{ > > + instrumentation_begin(); > > + rcu_virt_note_context_switch(smp_processor_id()); > > + instrumentation_end(); > > +} > > + > > static __always_inline void guest_enter_irqoff(void) > > { > > /* > > @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) > > instrumentation_begin(); > > vtime_account_kernel(current); > > current->flags |= PF_VCPU; > > - rcu_virt_note_context_switch(smp_processor_id()); > > instrumentation_end(); > > + > > + context_guest_enter_irqoff(); > > So we now do instrumentation_begin 2 times? Similar to context_guest_enter_irqoff() ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN. Wanpeng
[PATCH v2 3/3] x86/kvm: Fix vtime accounting
From: Wanpeng Li The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit, which breaks the tick-based time accounting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. Keep context tracking around the actual vmentry/exit code, the time accounting logic is separated out by a preceding patch from guest_enter/exit_irqoff(). Keep vtime-based time accounting around the actual vmentry/exit code when vtime_accounting_enabled_this_cpu() is true, leave PF_VCPU clearing after handle_exit_irqoff() and explicit IRQ window for tick-based time accounting. Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Cc: sta...@vger.kernel.org#v5.9-rc1+ Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 6 -- arch/x86/kvm/vmx/vmx.c | 6 -- arch/x86/kvm/x86.c | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 48b396f3..2a4e284 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3726,11 +3726,12 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * accordingly. */ instrumentation_begin(); + vtime_account_guest_enter(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); - guest_enter_irqoff(); + context_guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); if (sev_es_guest(vcpu->kvm)) { @@ -3758,10 +3759,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * into world and some more. 
*/ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); + __vtime_account_guest_exit(); instrumentation_end(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c05e6e2..23be956 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6613,11 +6613,12 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * accordingly. */ instrumentation_begin(); + vtime_account_guest_enter(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); - guest_enter_irqoff(); + context_guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -6647,10 +6648,11 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); + __vtime_account_guest_exit(); instrumentation_end(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce9a1d2..0d2dd3f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9245,6 +9245,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); + vcpu_account_guest_exit(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; -- 2.7.4
[PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
From: Wanpeng Li Split context_tracking part from guest_enter/exit_irqoff, it will be called separately in later patches. Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Signed-off-by: Wanpeng Li --- include/linux/context_tracking.h | 42 +--- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index bceb064..d8ad844 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -104,16 +104,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +static __always_inline void context_guest_enter_irqoff(void) { - instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; - instrumentation_end(); - if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_GUEST); @@ -131,10 +123,28 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* must be called with irqs disabled */ +static __always_inline void guest_enter_irqoff(void) +{ + instrumentation_begin(); + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; + instrumentation_end(); + + context_guest_enter_irqoff(); +} + +static __always_inline void context_guest_exit_irqoff(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_guest_exit_irqoff(); instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) } #else +static __always_inline void context_guest_enter_irqoff(void) +{ + instrumentation_begin(); + rcu_virt_note_context_switch(smp_processor_id()); + 
instrumentation_end(); +} + static __always_inline void guest_enter_irqoff(void) { /* @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_kernel(current); current->flags |= PF_VCPU; - rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); + + context_guest_enter_irqoff(); } +static __always_inline void context_guest_exit_irqoff(void) { } + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); -- 2.7.4
[PATCH v2 0/3] KVM: Properly account for guest CPU time
The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit, which breaks the tick-based time accounting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. This patchset splits both context tracking logic and the time accounting logic from guest_enter/exit_irqoff(), keep context tracking around the actual vmentry/exit code, have the virt time specific helpers which can be placed at the proper spots in kvm. In addition, it will not break the world outside of x86. v1 -> v2: * split context_tracking from guest_enter/exit_irqoff * provide separate vtime accounting functions for consistency * place the virt time specific helpers at the proper spot Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Wanpeng Li (3): context_tracking: Split guest_enter/exit_irqoff context_tracking: Provide separate vtime accounting functions x86/kvm: Fix vtime accounting arch/x86/kvm/svm/svm.c | 6 ++- arch/x86/kvm/vmx/vmx.c | 6 ++- arch/x86/kvm/x86.c | 1 + include/linux/context_tracking.h | 84 +++- 4 files changed, 74 insertions(+), 23 deletions(-) -- 2.7.4
[PATCH v2 2/3] context_tracking: Provide separate vtime accounting functions
From: Wanpeng Li Provide separate vtime accounting functions, because having proper wrappers for that case would be more consistent and less confusing. Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Signed-off-by: Wanpeng Li --- include/linux/context_tracking.h | 50 ++-- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d8ad844..491f889 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -102,6 +102,40 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ +static __always_inline void vtime_account_guest_enter(void) +{ + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN)) { + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; + } else { + vtime_account_kernel(current); + current->flags |= PF_VCPU; + } +} + +static __always_inline void __vtime_account_guest_exit(void) +{ + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN)) { + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_exit(current); + } else { + vtime_account_kernel(current); + } +} + +static __always_inline void vtime_account_guest_exit(void) +{ + __vtime_account_guest_exit(); + current->flags &= ~PF_VCPU; +} + +static __always_inline void vcpu_account_guest_exit(void) +{ + if (!vtime_accounting_enabled_this_cpu()) + current->flags &= ~PF_VCPU; +} #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static __always_inline void context_guest_enter_irqoff(void) @@ -127,10 +161,7 @@ static __always_inline void context_guest_enter_irqoff(void) static __always_inline void guest_enter_irqoff(void) { instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); context_guest_enter_irqoff(); @@ -147,10 
+178,7 @@ static __always_inline void guest_exit_irqoff(void) context_guest_exit_irqoff(); instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } @@ -170,8 +198,7 @@ static __always_inline void guest_enter_irqoff(void) * to flush. */ instrumentation_begin(); - vtime_account_kernel(current); - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); context_guest_enter_irqoff(); @@ -183,8 +210,7 @@ static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- 2.7.4
[PATCH v2 3/3] KVM: X86: Do not yield to self
From: Wanpeng Li If the target is self we do not need to yield, we can avoid malicious guest to play this. Signed-off-by: Wanpeng Li --- v1 -> v2: * update comments arch/x86/kvm/x86.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f08e9b4..ce9a1d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8231,6 +8231,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) if (!target || !READ_ONCE(target->ready)) goto no_yield; + /* Ignore requests to yield to self */ + if (vcpu == target) + goto no_yield; + if (kvm_vcpu_yield_to(target) <= 0) goto no_yield; -- 2.7.4
[PATCH v2 2/3] KVM: X86: Count attempted/successful directed yield
From: Wanpeng Li To analyze some performance issues with lock contention and scheduling, it is nice to know when directed yield are successful or failing. Signed-off-by: Wanpeng Li --- v1 -> v2: * rename new vcpu stat * account success instead of ignore arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 24 ++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44f8930..5af7411 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 nested_run; + u64 directed_yield_attempted; + u64 directed_yield_successful; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16fb395..f08e9b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VCPU_STAT("nested_run", nested_run), + VCPU_STAT("directed_yield_attempted", directed_yield_attempted), + VCPU_STAT("directed_yield_successful", directed_yield_successful), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -8211,21 +8213,31 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) } EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.directed_yield_attempted++; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); - if (target 
&& READ_ONCE(target->ready)) - kvm_vcpu_yield_to(target); + if (!target || !READ_ONCE(target->ready)) + goto no_yield; + + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + + vcpu->stat.directed_yield_successful++; + +no_yield: + return; } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -8272,7 +8284,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -8290,7 +8302,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) break; - kvm_sched_yield(vcpu->kvm, a0); + kvm_sched_yield(vcpu, a0); ret = 0; break; default: -- 2.7.4
[PATCH v2 1/3] x86/kvm: Don't bother __pv_cpu_mask when !CONFIG_SMP
From: Wanpeng Li Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move it inside CONFIG_SMP. In addition, we can avoid define and alloc __pv_cpu_mask when !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. Signed-off-by: Wanpeng Li --- v1 -> v2: * shuffle things around a bit more arch/x86/kernel/kvm.c | 118 +++--- 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..224a7a1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void) } } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +574,49 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* +* We have to call flush only on online vCPUs. 
And +* queue flush_on_enter for pre-empted vCPUs +*/ + for_each_cpu(cpu, flushmask) { + src = _cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(>preempted, , + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -611,33 +654,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu) local_irq_enable(); return 0; } -#endif - -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); - - cpumask_copy(flushmask, cpumask); - /* -* We have to call flush only on online vCPUs. 
And -* queue flush_on_enter for pre-empted vCPUs -*/ - for_each_cpu(cpu, flushmask) { - src = _cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(>preempted, , - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } - native_flush_tlb_others(flushmask, info); -} +#endif static void __init kvm_guest_init(void) { @@ -653,12 +671,6 @@ static void __init kvm_guest_init(void) pv_ops.time.steal_clock = kvm_steal_clock; } - if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; - pr_info("KVM setup pv remote TLB flush\n"); - } - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -668,6 +680,12 @@ static void __init kvm_guest_init(void) } #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; @@ -734,7 +752,7 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_in
Re: [PATCH] KVM: X86: Do not yield to self
On Fri, 9 Apr 2021 at 00:56, Sean Christopherson wrote: > > On Thu, Apr 08, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > If the target is self we do not need to yield, we can avoid malicious > > guest to play this. > > > > Signed-off-by: Wanpeng Li > > --- > > Rebased on > > https://lore.kernel.org/kvm/1617697935-4158-1-git-send-email-wanpen...@tencent.com/ > > > > arch/x86/kvm/x86.c | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 43c9f9b..260650f 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -8230,6 +8230,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, > > unsigned long dest_id) > > if (!target) > > goto no_yield; > > > > + /* yield to self */ > > If you're going to bother with a comment, maybe elaborate a bit, e.g. > > /* Ignore requests to yield to self. */ Looks good, thanks. Wanpeng
Re: [PATCH] KVM: X86: Count success and invalid yields
On Fri, 9 Apr 2021 at 01:08, Sean Christopherson wrote: > > On Tue, Apr 06, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > To analyze some performance issues with lock contention and scheduling, > > it is nice to know when directed yield are successful or failing. > > > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/include/asm/kvm_host.h | 2 ++ > > arch/x86/kvm/x86.c | 26 -- > > 2 files changed, 22 insertions(+), 6 deletions(-) > > > > diff --git a/arch/x86/include/asm/kvm_host.h > > b/arch/x86/include/asm/kvm_host.h > > index 44f8930..157bcaa 100644 > > --- a/arch/x86/include/asm/kvm_host.h > > +++ b/arch/x86/include/asm/kvm_host.h > > @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { > > u64 halt_poll_success_ns; > > u64 halt_poll_fail_ns; > > u64 nested_run; > > + u64 yield_directed; > > + u64 yield_directed_ignore; > > }; > > > > struct x86_instruction_info; > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 16fb395..3b475cd 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > > VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), > > VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), > > VCPU_STAT("nested_run", nested_run), > > + VCPU_STAT("yield_directed", yield_directed), > > This is ambiguous, it's not clear without looking at the code if it's counting > attempts or actual yields. > > > + VCPU_STAT("yield_directed_ignore", yield_directed_ignore), > > "ignored" also feels a bit misleading, as that implies KVM deliberately > ignored > a valid request, whereas many of the failure paths are due to invalid requests > or errors of some kind. > > What about mirroring the halt poll stats, i.e. track "attempted" and > "successful", > as opposed to "attempted" and "ignored/failed".And maybe switched directed > and yield? I.e. directed_yield_attempted and directed_yield_successful. Good suggestion. 
> > Alternatively, would it make sense to do s/directed/pv, or is that not worth > the > potential risk of being wrong if a non-paravirt use case comes along? > > pv_yield_attempted > pv_yield_successful > > > VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), > > VM_STAT("mmu_pte_write", mmu_pte_write), > > VM_STAT("mmu_pde_zapped", mmu_pde_zapped), > > @@ -8211,21 +8213,33 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) > > } > > EXPORT_SYMBOL_GPL(kvm_apicv_init); > > > > -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) > > +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) > > { > > struct kvm_vcpu *target = NULL; > > struct kvm_apic_map *map; > > > > + vcpu->stat.yield_directed++; > > + > > rcu_read_lock(); > > - map = rcu_dereference(kvm->arch.apic_map); > > + map = rcu_dereference(vcpu->kvm->arch.apic_map); > > > > if (likely(map) && dest_id <= map->max_apic_id && > > map->phys_map[dest_id]) > > target = map->phys_map[dest_id]->vcpu; > > > > rcu_read_unlock(); > > + if (!target) > > + goto no_yield; > > + > > + if (!READ_ONCE(target->ready)) > > I vote to keep these checks together. That'll also make the addition of the > "don't yield to self" check match the order of ready vs. self in > kvm_vcpu_on_spin(). Do it in v2. Wanpeng
Re: [PATCH] x86/kvm: Don't alloc __pv_cpu_mask when !CONFIG_SMP
On Fri, 9 Apr 2021 at 04:20, Sean Christopherson wrote: > > On Wed, Apr 07, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move > > it inside CONFIG_SMP. In addition, we can avoid alloc __pv_cpu_mask when > > !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. > > ... > > > +static bool pv_tlb_flush_supported(void) { return false; } > > +static bool pv_ipi_supported(void) { return false; } > > +static void kvm_flush_tlb_others(const struct cpumask *cpumask, > > + const struct flush_tlb_info *info) { } > > +static void kvm_setup_pv_ipi(void) { } > > If you shuffle things around a bit more, you can avoid these stubs, and hide > the > definition of __pv_cpu_mask behind CONFIG_SMP, too. Thanks, I will move around. Wanpeng
[PATCH] KVM: X86: Do not yield to self
From: Wanpeng Li If the target is self we do not need to yield, we can avoid malicious guest to play this. Signed-off-by: Wanpeng Li --- Rebased on https://lore.kernel.org/kvm/1617697935-4158-1-git-send-email-wanpen...@tencent.com/ arch/x86/kvm/x86.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43c9f9b..260650f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8230,6 +8230,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) if (!target) goto no_yield; + /* yield to self */ + if (vcpu->vcpu_id == target->vcpu_id) + goto no_yield; + if (!READ_ONCE(target->ready)) goto no_yield; -- 2.7.4
Re: [PATCH v2 07/17] KVM: x86/mmu: Check PDPTRs before allocating PAE roots
On Fri, 5 Mar 2021 at 09:12, Sean Christopherson wrote: > > Check the validity of the PDPTRs before allocating any of the PAE roots, > otherwise a bad PDPTR will cause KVM to leak any previously allocated > roots. > > Signed-off-by: Sean Christopherson > --- > arch/x86/kvm/mmu/mmu.c | 20 ++-- > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 7ebfbc77b050..9fc2b46f8541 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -3269,7 +3269,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) > static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > { > struct kvm_mmu *mmu = vcpu->arch.mmu; > - u64 pdptr, pm_mask; > + u64 pdptrs[4], pm_mask; > gfn_t root_gfn, root_pgd; > hpa_t root; > int i; > @@ -3280,6 +3280,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu > *vcpu) > if (mmu_check_root(vcpu, root_gfn)) > return 1; > > + if (mmu->root_level == PT32E_ROOT_LEVEL) { > + for (i = 0; i < 4; ++i) { > + pdptrs[i] = mmu->get_pdptr(vcpu, i); > + if (!(pdptrs[i] & PT_PRESENT_MASK)) > + continue; > + > + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) > + return 1; > + } > + } > + I saw this splatting: BUG: sleeping function called from invalid context at arch/x86/kvm/kvm_cache_regs.h:115 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 3090, name: qemu-system-x86 3 locks held by qemu-system-x86/3090: #0: 8d499f8d00d0 (>mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x8e/0x680 [kvm] #1: b1b540f873e8 (>srcu){}-{0:0}, at: vcpu_enter_guest+0xb30/0x1b10 [kvm] #2: b1b540f7d018 (&(kvm)->mmu_lock){+.+.}-{2:2}, at: kvm_mmu_load+0xb5/0x540 [kvm] Preemption disabled at: [] kvm_mmu_load+0xb5/0x540 [kvm] CPU: 2 PID: 3090 Comm: qemu-system-x86 Tainted: GW OE 5.12.0-rc3+ #3 Call Trace: dump_stack+0x87/0xb7 ___might_sleep+0x202/0x250 __might_sleep+0x4a/0x80 kvm_pdptr_read+0x20/0x60 [kvm] kvm_mmu_load+0x3bd/0x540 [kvm] vcpu_enter_guest+0x1297/0x1b10 [kvm] 
kvm_arch_vcpu_ioctl_run+0x372/0x690 [kvm] kvm_vcpu_ioctl+0x3ca/0x680 [kvm] __x64_sys_ioctl+0x27a/0x800 do_syscall_64+0x38/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae There is a might_sleep() in kvm_pdptr_read(), however, the original commit didn't explain more. I can send a formal one if the below fix is acceptable. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efb41f3..f33026f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3289,17 +3289,24 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; +write_unlock(>kvm->mmu_lock); if (mmu->root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; -if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) +if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) { +write_lock(>kvm->mmu_lock); return 1; +} } } +write_lock(>kvm->mmu_lock); +if (make_mmu_pages_available(vcpu)) +return -ENOSPC; + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root.
Re: [PATCH v2] KVM: Explicitly use GFP_KERNEL_ACCOUNT for 'struct kvm_vcpu' allocations
On Wed, 7 Apr 2021 at 03:07, Sean Christopherson wrote: > > Use GFP_KERNEL_ACCOUNT when allocating vCPUs to make it more obvious that > that the allocations are accounted, to make it easier to audit KVM's > allocations in the future, and to be consistent with other cache usage in > KVM. > > When using SLAB/SLUB, this is a nop as the cache itself is created with > SLAB_ACCOUNT. > > When using SLOB, there are caveats within caveats. SLOB doesn't honor > SLAB_ACCOUNT, so passing GFP_KERNEL_ACCOUNT will result in vCPU > allocations now being accounted. But, even that depends on internal > SLOB details as SLOB will only go to the page allocator when its cache is > depleted. That just happens to be extremely likely for vCPUs because the > size of kvm_vcpu is larger than the a page for almost all combinations of > architecture and page size. Whether or not the SLOB behavior is by > design is unknown; it's just as likely that no SLOB users care about > accounding and so no one has bothered to implemented support in SLOB. > Regardless, accounting vCPU allocations will not break SLOB+KVM+cgroup > users, if any exist. > > Cc: Wanpeng Li > Signed-off-by: Sean Christopherson Reviewed-by: Wanpeng Li > --- > > v2: Drop the Fixes tag and rewrite the changelog since this is a nop when > using SLUB or SLAB. [Wanpeng] > > virt/kvm/kvm_main.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 0a481e7780f0..580f98386b42 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3192,7 +3192,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > if (r) > goto vcpu_decrement; > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); > if (!vcpu) { > r = -ENOMEM; > goto vcpu_decrement; > -- > 2.31.0.208.g409f899ff0-goog >
[PATCH] x86/kvm: Don't alloc __pv_cpu_mask when !CONFIG_SMP
From: Wanpeng Li Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move it inside CONFIG_SMP. In addition, we can avoid alloc __pv_cpu_mask when !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvm.c | 79 +-- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..202e1f0 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -451,6 +451,11 @@ static void __init sev_map_percpu_data(void) } } + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +#ifdef CONFIG_SMP + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +463,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +575,32 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* +* We have to call flush only on online vCPUs. 
And +* queue flush_on_enter for pre-empted vCPUs +*/ + for_each_cpu(cpu, flushmask) { + src = _cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(>preempted, , + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -611,33 +638,16 @@ static int kvm_cpu_down_prepare(unsigned int cpu) local_irq_enable(); return 0; } -#endif -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); +#else - cpumask_copy(flushmask, cpumask); - /* -* We have to call flush only on online vCPUs. And -* queue flush_on_enter for pre-empted vCPUs -*/ - for_each_cpu(cpu, flushmask) { - src = _cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(>preempted, , - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } +static bool pv_tlb_flush_supported(void) { return false; } +static bool pv_ipi_supported(void) { return false; } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { } +static void kvm_setup_pv_ipi(void) { } - native_flush_tlb_others(flushmask, info); -} +#endif static void __init kvm_guest_init(void) { @@ -734,10 +744,8 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { -#if defined(CONFIG_SMP) if (pv_ipi_supported()) kvm_setup_pv_ipi(); -#endif } static bool __init kvm_msi_ext_dest_id(void) @@ -797,20 +805,11 @@ arch_initcall(activate_jump_labels); static __init int kvm_alloc_cpumask(void) { int cpu; - bool alloc = false; if (!kvm_para_available() || nopv) return 0; - if (pv_tlb_flush_supported()) - alloc = true; - -#if defined(CONFIG_SMP) - if 
(pv_ipi_supported()) - alloc = true; -#endif - - if (alloc) + if (pv_tlb_flush_supported() || pv_ipi_supported()) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); -- 2.7.4
[PATCH] KVM: X86: Count success and invalid yields
From: Wanpeng Li To analyze some performance issues with lock contention and scheduling, it is nice to know when directed yield are successful or failing. Signed-off-by: Wanpeng Li --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 26 -- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44f8930..157bcaa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 nested_run; + u64 yield_directed; + u64 yield_directed_ignore; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16fb395..3b475cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VCPU_STAT("nested_run", nested_run), + VCPU_STAT("yield_directed", yield_directed), + VCPU_STAT("yield_directed_ignore", yield_directed_ignore), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -8211,21 +8213,33 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) } EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.yield_directed++; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); + if (!target) + goto no_yield; + + if (!READ_ONCE(target->ready)) + goto no_yield; - if (target && READ_ONCE(target->ready)) - 
kvm_vcpu_yield_to(target); + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + return; + +no_yield: + vcpu->stat.yield_directed_ignore++; + return; } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -8272,7 +8286,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -8290,7 +8304,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) break; - kvm_sched_yield(vcpu->kvm, a0); + kvm_sched_yield(vcpu, a0); ret = 0; break; default: -- 2.7.4
Re: [PATCH 1/2] KVM: Account memory allocations for 'struct kvm_vcpu'
On Wed, 31 Mar 2021 at 11:24, Sean Christopherson wrote: > > On Wed, Mar 31, 2021, Wanpeng Li wrote: > > On Wed, 31 Mar 2021 at 10:32, Sean Christopherson wrote: > > > > > > Use GFP_KERNEL_ACCOUNT for the vCPU allocations, the vCPUs are very much > > > tied to a single task/VM. For x86, the allocations were accounted up > > > until the allocation code was moved to common KVM. For all other > > > architectures, vCPU allocations were never previously accounted, but only > > > because most architectures lack accounting in general (for KVM). > > > > > > Fixes: e529ef66e6b5 ("KVM: Move vcpu alloc and init invocation to common > > > code") > > > Signed-off-by: Sean Christopherson > > > --- > > > virt/kvm/kvm_main.c | 2 +- > > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > index 383df23514b9..3884e9f30251 100644 > > > --- a/virt/kvm/kvm_main.c > > > +++ b/virt/kvm/kvm_main.c > > > @@ -3182,7 +3182,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm > > > *kvm, u32 id) > > > if (r) > > > goto vcpu_decrement; > > > > > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > > > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); > > > > kvm_vcpu_cache is created with SLAB_ACCOUNT flag in kvm_init(), this > > flag will guarantee further slab alloc will be charged to memcg. > > Please refer to memcg_slab_pre_alloc_hook(). So the patch is > > unnecessary. > > Hmm, I missed that. However, AFICT only SLAB/SLUB enforce SLAB_ACCOUNT, SLOB > does not appear to honor the flag. The caveat to SLOB is that the > GFP_KERNEL_ACCOUNT will only come into play when allocating new pages, and so > allocations smaller than a page will be accounted incorrectly (I think). > But, a vcpu is larger than a page (on x86), which means the vcpu allocation > will > always be correctly accounted. 
> > I've no idea if anyone actually uses KVM+SLOB, let alone cares about > accounting I asked maintainer Christoph in 2013, he told me "Well, I have never seen non experimental systems that use SLOB. Others have claimed they exist. It's mostly of academic interest." > in that case. But, it would be nice for KVM to be consistent with the > other > kmem_cache usage in KVM, all of which do double up on SLAB_ACCOUNT + > GFP_KERNEL_ACCOUNT. > > Maybe rewrite the changelog and drop the Fixes? Agreed. Wanpeng
Re: [PATCH 1/2] KVM: Account memory allocations for 'struct kvm_vcpu'
On Wed, 31 Mar 2021 at 10:32, Sean Christopherson wrote: > > Use GFP_KERNEL_ACCOUNT for the vCPU allocations, the vCPUs are very much > tied to a single task/VM. For x86, the allocations were accounted up > until the allocation code was moved to common KVM. For all other > architectures, vCPU allocations were never previously accounted, but only > because most architectures lack accounting in general (for KVM). > > Fixes: e529ef66e6b5 ("KVM: Move vcpu alloc and init invocation to common > code") > Signed-off-by: Sean Christopherson > --- > virt/kvm/kvm_main.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 383df23514b9..3884e9f30251 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3182,7 +3182,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > if (r) > goto vcpu_decrement; > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); kvm_vcpu_cache is created with SLAB_ACCOUNT flag in kvm_init(), this flag will guarantee further slab alloc will be charged to memcg. Please refer to memcg_slab_pre_alloc_hook(). So the patch is unnecessary. Wanpeng
Re: [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken
On Wed, 31 Mar 2021 at 01:01, Paolo Bonzini wrote: > > pvclock_gtod_sync_lock can be taken with interrupts disabled if the > preempt notifier calls get_kvmclock_ns to update the Xen > runstate information: > >spin_lock include/linux/spinlock.h:354 [inline] >get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587 >kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69 >kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100 >kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline] >kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062 > > So change the users of the spinlock to spin_lock_irqsave and > spin_unlock_irqrestore. > > Reported-by: syzbot+b282b65c2c68492df...@syzkaller.appspotmail.com > Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate > information") > Cc: David Woodhouse > Cc: Marcelo Tosatti > Signed-off-by: Paolo Bonzini Reviewed-by: Wanpeng Li > --- > arch/x86/kvm/x86.c | 25 ++--- > 1 file changed, 14 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 0a83eff40b43..2bfd00da465f 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2329,7 +2329,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, > u64 data) > kvm_vcpu_write_tsc_offset(vcpu, offset); > raw_spin_unlock_irqrestore(>arch.tsc_write_lock, flags); > > - spin_lock(>arch.pvclock_gtod_sync_lock); > + spin_lock_irqsave(>arch.pvclock_gtod_sync_lock, flags); > if (!matched) { > kvm->arch.nr_vcpus_matched_tsc = 0; > } else if (!already_matched) { > @@ -2337,7 +2337,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, > u64 data) > } > > kvm_track_tsc_matching(vcpu); > - spin_unlock(>arch.pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>arch.pvclock_gtod_sync_lock, flags); > } > > static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, > @@ -2559,15 +2559,16 @@ static void kvm_gen_update_masterclock(struct kvm > *kvm) > int i; > struct kvm_vcpu *vcpu; > struct kvm_arch *ka = >arch; > + 
unsigned long flags; > > kvm_hv_invalidate_tsc_page(kvm); > > kvm_make_mclock_inprogress_request(kvm); > > /* no guest entries from this point */ > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > pvclock_update_vm_gtod_copy(kvm); > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > @@ -2582,17 +2583,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) > { > struct kvm_arch *ka = >arch; > struct pvclock_vcpu_time_info hv_clock; > + unsigned long flags; > u64 ret; > > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > if (!ka->use_master_clock) { > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > return get_kvmclock_base_ns() + ka->kvmclock_offset; > } > > hv_clock.tsc_timestamp = ka->master_cycle_now; > hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > /* both __this_cpu_read() and rdtsc() should be on the same cpu */ > get_cpu(); > @@ -2686,13 +2688,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) > * If the host uses TSC clock, then passthrough TSC as stable > * to the guest. > */ > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > use_master_clock = ka->use_master_clock; > if (use_master_clock) { > host_tsc = ka->master_cycle_now; > kernel_ns = ka->master_kernel_ns; > } > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > /* Keep irq disabled to prevent changes to the clock */ > local_irq_save(flags); > @@ -7724,6 +7726,7 @@ static void kvm_hyperv_tsc_notifier(void) > struct kvm *kvm; > struct kvm_vcpu *vcpu; > int cpu; > + unsigned long flags; > >
Re: [PATCH 1/2] KVM: x86: reduce pvclock_gtod_sync_lock critical sections
On Wed, 31 Mar 2021 at 01:02, Paolo Bonzini wrote: > > There is no need to include changes to vcpu->requests into > the pvclock_gtod_sync_lock critical section. The changes to > the shared data structures (in pvclock_update_vm_gtod_copy) > already occur under the lock. > > Cc: David Woodhouse > Cc: Marcelo Tosatti > Signed-off-by: Paolo Bonzini Reviewed-by: Wanpeng Li > --- > arch/x86/kvm/x86.c | 10 -- > 1 file changed, 4 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index fe806e894212..0a83eff40b43 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2562,10 +2562,12 @@ static void kvm_gen_update_masterclock(struct kvm > *kvm) > > kvm_hv_invalidate_tsc_page(kvm); > > - spin_lock(>pvclock_gtod_sync_lock); > kvm_make_mclock_inprogress_request(kvm); > + > /* no guest entries from this point */ > + spin_lock(>pvclock_gtod_sync_lock); > pvclock_update_vm_gtod_copy(kvm); > + spin_unlock(>pvclock_gtod_sync_lock); > > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > @@ -2573,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > /* guest entries allowed */ > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); > - > - spin_unlock(>pvclock_gtod_sync_lock); > #endif > } > > @@ -7740,16 +7740,14 @@ static void kvm_hyperv_tsc_notifier(void) > struct kvm_arch *ka = >arch; > > spin_lock(>pvclock_gtod_sync_lock); > - > pvclock_update_vm_gtod_copy(kvm); > + spin_unlock(>pvclock_gtod_sync_lock); > > kvm_for_each_vcpu(cpu, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > > kvm_for_each_vcpu(cpu, vcpu, kvm) > kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); > - > - spin_unlock(>pvclock_gtod_sync_lock); > } > mutex_unlock(_lock); > } > -- > 2.26.2 > >
Re: [PATCH] KVM: X86: Properly account for guest CPU time when considering context tracking
On Tue, 30 Mar 2021 at 01:15, Sean Christopherson wrote: > > +Thomas > > On Mon, Mar 29, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > reported that the guest time remains 0 when running a while true > > loop in the guest. > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > tick-based time accouting when the ticks that happen after IRQs are > > disabled are incorrectly accounted to the host/system time. This is > > because we exit the guest state too early. > > > > vtime-based time accounting is tied to context tracking, keep the > > guest_exit_irqoff() around vmexit code when both vtime-based time > > accounting and specific cpu is context tracking mode active. > > Otherwise, leave guest_exit_irqoff() after handle_exit_irqoff() > > and explicit IRQ window for tick-based time accouting. > > > > Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") > > Cc: Sean Christopherson > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/svm/svm.c | 3 ++- > > arch/x86/kvm/vmx/vmx.c | 3 ++- > > arch/x86/kvm/x86.c | 2 ++ > > 3 files changed, 6 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index 58a45bb..55fb5ce 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -3812,7 +3812,8 @@ static noinstr void svm_vcpu_enter_exit(struct > > kvm_vcpu *vcpu, > >* into world and some more. 
> >*/ > > lockdep_hardirqs_off(CALLER_ADDR0); > > - guest_exit_irqoff(); > > + if (vtime_accounting_enabled_this_cpu()) > > + guest_exit_irqoff(); > > > > instrumentation_begin(); > > trace_hardirqs_off_finish(); > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > > index 32cf828..85695b3 100644 > > --- a/arch/x86/kvm/vmx/vmx.c > > +++ b/arch/x86/kvm/vmx/vmx.c > > @@ -6689,7 +6689,8 @@ static noinstr void vmx_vcpu_enter_exit(struct > > kvm_vcpu *vcpu, > >* into world and some more. > >*/ > > lockdep_hardirqs_off(CALLER_ADDR0); > > - guest_exit_irqoff(); > > + if (vtime_accounting_enabled_this_cpu()) > > + guest_exit_irqoff(); > > This looks ok, as CONFIG_CONTEXT_TRACKING and CONFIG_VIRT_CPU_ACCOUNTING_GEN > are > selected by CONFIG_NO_HZ_FULL=y, and can't be enabled independently, e.g. the > rcu_user_exit() call won't be delayed because it will never be called in the > !vtime case. But it still feels wrong poking into those details, e.g. it'll > be weird and/or wrong guest_exit_irqoff() gains stuff that isn't vtime > specific. Could you elaborate what's the meaning of "it'll be weird and/or wrong guest_exit_irqoff() gains stuff that isn't vtime specific."? Wanpeng
[PATCH] KVM: X86: Properly account for guest CPU time when considering context tracking
From: Wanpeng Li The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit breaks the tick-based time accouting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. vtime-based time accounting is tied to context tracking, keep the guest_exit_irqoff() around vmexit code when both vtime-based time accounting and specific cpu is context tracking mode active. Otherwise, leave guest_exit_irqoff() after handle_exit_irqoff() and explicit IRQ window for tick-based time accouting. Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") Cc: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 3 ++- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58a45bb..55fb5ce 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3812,7 +3812,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf828..85695b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6689,7 +6689,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. 
*/ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe806e8..234c8b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9185,6 +9185,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); + if (!vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; -- 2.7.4
Re: [syzbot] possible deadlock in scheduler_tick
Cc David Woodhouse, On Wed, 24 Mar 2021 at 18:11, syzbot wrote: > > Hello, > > syzbot found the following issue on: > > HEAD commit:1c273e10 Merge tag 'zonefs-5.12-rc4' of git://git.kernel.o.. > git tree: upstream > console output: https://syzkaller.appspot.com/x/log.txt?x=13c0414ed0 > kernel config: https://syzkaller.appspot.com/x/.config?x=6abda3336c698a07 > dashboard link: https://syzkaller.appspot.com/bug?extid=b282b65c2c68492df769 > userspace arch: i386 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17d86ad6d0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17b8497cd0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+b282b65c2c68492df...@syzkaller.appspotmail.com > > = > WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected > 5.12.0-rc3-syzkaller #0 Not tainted > - > syz-executor030/8435 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: > c90001a2a230 (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2}, at: > spin_lock include/linux/spinlock.h:354 [inline] > c90001a2a230 (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2}, at: > get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587 > > and this task is already holding: > 8880b9d35198 (>lock){-.-.}-{2:2}, at: rq_lock > kernel/sched/sched.h:1321 [inline] > 8880b9d35198 (>lock){-.-.}-{2:2}, at: __schedule+0x21c/0x21b0 > kernel/sched/core.c:4990 > which would create a new lock dependency: > (>lock){-.-.}-{2:2} -> (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2} > > but this new dependency connects a HARDIRQ-irq-safe lock: > (>lock){-.-.}-{2:2} > > ... 
which became HARDIRQ-irq-safe at: > lock_acquire kernel/locking/lockdep.c:5510 [inline] > lock_acquire+0x1ab/0x740 kernel/locking/lockdep.c:5475 > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151 > rq_lock kernel/sched/sched.h:1321 [inline] > scheduler_tick+0xa4/0x4b0 kernel/sched/core.c:4538 > update_process_times+0x191/0x200 kernel/time/timer.c:1801 > tick_periodic+0x79/0x230 kernel/time/tick-common.c:100 > tick_handle_periodic+0x41/0x120 kernel/time/tick-common.c:112 > timer_interrupt+0x3f/0x60 arch/x86/kernel/time.c:57 > __handle_irq_event_percpu+0x303/0x8f0 kernel/irq/handle.c:156 > handle_irq_event_percpu kernel/irq/handle.c:196 [inline] > handle_irq_event+0x102/0x290 kernel/irq/handle.c:213 > handle_level_irq+0x256/0x6e0 kernel/irq/chip.c:650 > generic_handle_irq_desc include/linux/irqdesc.h:158 [inline] > handle_irq arch/x86/kernel/irq.c:231 [inline] > __common_interrupt+0x9e/0x200 arch/x86/kernel/irq.c:250 > common_interrupt+0x9f/0xd0 arch/x86/kernel/irq.c:240 > asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:623 > __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:161 [inline] > _raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:191 > __setup_irq+0xc72/0x1ce0 kernel/irq/manage.c:1737 > request_threaded_irq+0x28a/0x3b0 kernel/irq/manage.c:2127 > request_irq include/linux/interrupt.h:160 [inline] > setup_default_timer_irq arch/x86/kernel/time.c:70 [inline] > hpet_time_init+0x28/0x42 arch/x86/kernel/time.c:82 > x86_late_time_init+0x58/0x94 arch/x86/kernel/time.c:94 > start_kernel+0x3ee/0x496 init/main.c:1028 > secondary_startup_64_no_verify+0xb0/0xbb > > to a HARDIRQ-irq-unsafe lock: > (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2} > > ... which became HARDIRQ-irq-unsafe at: > ... 
> lock_acquire kernel/locking/lockdep.c:5510 [inline] > lock_acquire+0x1ab/0x740 kernel/locking/lockdep.c:5475 > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151 > spin_lock include/linux/spinlock.h:354 [inline] > kvm_synchronize_tsc+0x459/0x1230 arch/x86/kvm/x86.c:2332 > kvm_arch_vcpu_postcreate+0x73/0x180 arch/x86/kvm/x86.c:10183 > kvm_vm_ioctl_create_vcpu arch/x86/kvm/../../../virt/kvm/kvm_main.c:3239 > [inline] > kvm_vm_ioctl+0x1b2d/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3839 > kvm_vm_compat_ioctl+0x125/0x230 > arch/x86/kvm/../../../virt/kvm/kvm_main.c:4052 > __do_compat_sys_ioctl+0x1d3/0x230 fs/ioctl.c:842 > do_syscall_32_irqs_on arch/x86/entry/common.c:77 [inline] > __do_fast_syscall_32+0x56/0x90 arch/x86/entry/common.c:140 > do_fast_syscall_32+0x2f/0x70 arch/x86/entry/common.c:165 > entry_SYSENTER_compat_after_hwframe+0x4d/0x5c > > other info that might help us debug this: > > Possible interrupt unsafe locking scenario: > >CPU0CPU1 > > lock(>arch.pvclock_gtod_sync_lock); >local_irq_disable(); >lock(>lock); >lock(>arch.pvclock_gtod_sync_lock); > > lock(>lock); > The offender is
Re: [PATCH] KVM: arm: memcg awareness
On Wed, 17 Mar 2021 at 16:04, Wanpeng Li wrote: > > On Wed, 17 Mar 2021 at 15:57, Michal Hocko wrote: > > > > On Wed 17-03-21 13:46:24, Wanpeng Li wrote: > > > From: Wanpeng Li > > > > > > KVM allocations in the arm kvm code which are tied to the life > > > of the VM process should be charged to the VM process's cgroup. > > > > How much memory are we talking about? > > > > > This will help the memcg controller to do the right decisions. > > > > This is a bit vague. What is the right decision? AFAICS none of that > > memory is considered during oom victim selection. The only thing memcg > > controller can help with is to contain and account this additional > > memory. This might help to better isolate multiple workloads on the same > > system. Maybe this is what you wanted to say? Or maybe this is a way to > > prevent untrusted users from consuming a lot of memory? > https://patchwork.kernel.org/project/kvm/patch/20190211190252.198101-1-bgar...@google.com/ > It is explained in this patchset for x86 kvm which is upstream, I > think I don't need to copy and paste. :) > > Wanpeng
Re: [PATCH] KVM: arm: memcg awareness
On Wed, 17 Mar 2021 at 15:57, Michal Hocko wrote: > > On Wed 17-03-21 13:46:24, Wanpeng Li wrote: > > From: Wanpeng Li > > > > KVM allocations in the arm kvm code which are tied to the life > > of the VM process should be charged to the VM process's cgroup. > > How much memory are we talking about? > > > This will help the memcg controller to do the right decisions. > > This is a bit vague. What is the right decision? AFAICS none of that > memory is considered during oom victim selection. The only thing memcg > controller can help with is to contain and account this additional > memory. This might help to better isolate multiple workloads on the same > system. Maybe this is what you wanted to say? Or maybe this is a way to > prevent untrusted users from consuming a lot of memory? It is explained in this patchset for x86 kvm which is upstream, I think I don't need to copy and paste. :) Wanpeng
[PATCH] KVM: arm: memcg awareness
From: Wanpeng Li KVM allocations in the arm kvm code which are tied to the life of the VM process should be charged to the VM process's cgroup. This will help the memcg controler to do the right decisions. Signed-off-by: Wanpeng Li --- arch/arm64/kvm/arm.c | 5 +++-- arch/arm64/kvm/hyp/pgtable.c | 4 ++-- arch/arm64/kvm/mmu.c | 4 ++-- arch/arm64/kvm/pmu-emul.c | 2 +- arch/arm64/kvm/reset.c | 2 +- arch/arm64/kvm/vgic/vgic-debug.c | 2 +- arch/arm64/kvm/vgic/vgic-init.c| 2 +- arch/arm64/kvm/vgic/vgic-irqfd.c | 2 +- arch/arm64/kvm/vgic/vgic-its.c | 14 +++--- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +- arch/arm64/kvm/vgic/vgic-v4.c | 2 +- 11 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7f06ba7..8040874 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -278,9 +278,10 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm *kvm_arch_alloc_vm(void) { if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); + return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT); - return vzalloc(sizeof(struct kvm)); + return __vmalloc(sizeof(struct kvm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 926fc07..a0845d3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -366,7 +366,7 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; - childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!childp) return -ENOMEM; @@ -401,7 +401,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) { u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); - pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pgt->pgd) return -ENOMEM; diff --git 
a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8711894..8c9dc49 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -370,7 +370,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return -EINVAL; } - pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; @@ -1244,7 +1244,7 @@ int kvm_mmu_init(void) goto out; } - hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL_ACCOUNT); if (!hyp_pgtable) { kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e32c6e1..00cf750 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -967,7 +967,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) mutex_lock(>kvm->lock); if (!vcpu->kvm->arch.pmu_filter) { - vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!vcpu->kvm->arch.pmu_filter) { mutex_unlock(>kvm->lock); return -ENOMEM; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index bd354cd..3cbcf6b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -110,7 +110,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl > SVE_VL_ARCH_MAX)) return -EIO; - buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index f38c40a..e6a01f2 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -92,7 +92,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) goto out; } - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL_ACCOUNT); if (!iter) { iter = 
ERR_PTR(-ENOMEM); goto out; diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 052917d..27d1513 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Sat, 13 Mar 2021 at 17:33, Paolo Bonzini wrote: > > On 13/03/21 01:57, Wanpeng Li wrote: > >> A third option would be to split the paths. In the end, it's only the > >> ptr/val > >> line that's shared. > > I just sent out a formal patch for my alternative fix, I think the > > whole logic in kvm_wait is more clear w/ my version. > > > > I don't know, having three "if"s in 10 lines of code is a bit daunting. Fair enough, just sent out v3 per Sean's suggestion. Wanpeng
[PATCH v3] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by playing local_irq_disable()/enable() directly. Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- v2 -> v3: * per Sean's suggestion arch/x86/kernel/kvm.c | 23 ++- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..72dbb74 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. 
Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 -- 2.7.4
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Thu, 11 Mar 2021 at 23:54, Sean Christopherson wrote: > > On Tue, Feb 23, 2021, Wanpeng Li wrote: > > On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > > > > > From: Wanpeng Li > > > > > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > > > guest > > > splatting below during boot: > > > > > > raw_local_irq_restore() called with IRQs enabled > > > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > > > warn_bogus_irq_restore+0x26/0x30 > > > Modules linked in: hid_generic usbhid hid > > > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > > > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > > > Call Trace: > > > kvm_wait+0x76/0x90 > > > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > > > do_raw_spin_lock+0xc9/0xd0 > > > _raw_spin_lock+0x59/0x70 > > > lockref_get_not_dead+0xf/0x50 > > > __legitimize_path+0x31/0x60 > > > legitimize_root+0x37/0x50 > > > try_to_unlazy_next+0x7f/0x1d0 > > > lookup_fast+0xb0/0x170 > > > path_openat+0x165/0x9b0 > > > do_filp_open+0x99/0x110 > > > do_sys_openat2+0x1f1/0x2e0 > > > do_sys_open+0x5c/0x80 > > > __x64_sys_open+0x21/0x30 > > > do_syscall_64+0x32/0x50 > > > entry_SYSCALL_64_after_hwframe+0x44/0xae > > > > > > The irqflags handling in kvm_wait() which ends up doing: > > > > > > local_irq_save(flags); > > > safe_halt(); > > > local_irq_restore(flags); > > > > > > which triggered a new consistency checking, we generally expect > > > local_irq_save() and local_irq_restore() to be pared and sanely > > > nested, and so local_irq_restore() expects to be called with > > > irqs disabled. > > > > > > This patch fixes it by adding a local_irq_disable() after safe_halt() > > > to avoid this warning. 
> > > > > > Cc: Mark Rutland > > > Cc: Thomas Gleixner > > > Signed-off-by: Wanpeng Li > > > --- > > > arch/x86/kernel/kvm.c | 4 +++- > > > 1 file changed, 3 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > > > index 5e78e01..688c84a 100644 > > > --- a/arch/x86/kernel/kvm.c > > > +++ b/arch/x86/kernel/kvm.c > > > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > > > */ > > > if (arch_irqs_disabled_flags(flags)) > > > halt(); > > > - else > > > + else { > > > safe_halt(); > > > + local_irq_disable(); > > > + } > > > > An alternative fix: > > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > > index 5e78e01..7127aef 100644 > > --- a/arch/x86/kernel/kvm.c > > +++ b/arch/x86/kernel/kvm.c > > @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) > > > > static void kvm_wait(u8 *ptr, u8 val) > > { > > -unsigned long flags; > > +bool disabled = irqs_disabled(); > > > > if (in_nmi()) > > return; > > > > -local_irq_save(flags); > > +if (!disabled) > > +local_irq_disable(); > > > > if (READ_ONCE(*ptr) != val) > > goto out; > > @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) > > * for irq enabled case to avoid hang when lock info is overwritten > > * in irq spinlock slowpath and no spurious interrupt occur to save us. > > */ > > -if (arch_irqs_disabled_flags(flags)) > > +if (disabled) > > halt(); > > else > > safe_halt(); > > > > out: > > -local_irq_restore(flags); > > +if (!disabled) > > +local_irq_enable(); > > } > > > > #ifdef CONFIG_X86_32 > > A third option would be to split the paths. In the end, it's only the ptr/val > line that's shared. I just sent out a formal patch for my alternative fix, I think the whole logic in kvm_wait is more clear w/ my version. 
> > --- > arch/x86/kernel/kvm.c | 23 ++- > 1 file changed, 10 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01ca3b4..78bb0fae3982 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) > > s
[PATCH v2] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by playing local_irq_disable()/enable() directly. 
Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- v1 -> v2: * select the alternative fix arch/x86/kernel/kvm.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..7127aef 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; + bool disabled = irqs_disabled(); if (in_nmi()) return; - local_irq_save(flags); + if (!disabled) + local_irq_disable(); if (READ_ONCE(*ptr) != val) goto out; @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) + if (disabled) halt(); else safe_halt(); out: - local_irq_restore(flags); + if (!disabled) + local_irq_enable(); } #ifdef CONFIG_X86_32 -- 2.7.4
[PATCH] KVM: X86: Fix missing local pCPU when executing wbinvd on all dirty pCPUs
From: Wanpeng Li We should execute wbinvd on all dirty pCPUs when guest wbinvd exits to maintain data consistency in order to deal with noncoherent DMA. smp_call_function_many() does not execute the provided function on the local core, this patch replaces it by on_each_cpu_mask(). Reported-by: Nadav Amit Cc: Nadav Amit Signed-off-by: Wanpeng Li --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 012d5df..aa6d667 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6638,7 +6638,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); -- 2.7.4
Re: [PATCH] KVM: x86: Ensure deadline timer has truly expired before posting its IRQ
On Fri, 5 Mar 2021 at 10:19, Sean Christopherson wrote: > > When posting a deadline timer interrupt, open code the checks guarding > __kvm_wait_lapic_expire() in order to skip the lapic_timer_int_injected() > check in kvm_wait_lapic_expire(). The injection check will always fail > since the interrupt has not yet be injected. Moving the call after > injection would also be wrong as that wouldn't actually delay delivery > of the IRQ if it is indeed sent via posted interrupt. > > Fixes: 010fd37fddf6 ("KVM: LAPIC: Reduce world switch latency caused by > timer_advance_ns") > Cc: sta...@vger.kernel.org > Signed-off-by: Sean Christopherson > --- > arch/x86/kvm/lapic.c | 11 ++- > 1 file changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 45d40bfacb7c..cb8ebfaccfb6 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, > bool from_timer_fn) > } > > if (kvm_use_posted_timer_interrupt(apic->vcpu)) { > - kvm_wait_lapic_expire(vcpu); > + /* > +* Ensure the guest's timer has truly expired before posting > an > +* interrupt. Open code the relevant checks to avoid querying > +* lapic_timer_int_injected(), which will be false since the > +* interrupt isn't yet injected. Waiting until after > injecting > +* is not an option since that won't help a posted interrupt. > +*/ > + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && > + vcpu->arch.apic->lapic_timer.timer_advance_ns) > + __kvm_wait_lapic_expire(vcpu); Thanks for the fixing. Wanpeng
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
ping, On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > From: Wanpeng Li > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > guest > splatting below during boot: > > raw_local_irq_restore() called with IRQs enabled > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > warn_bogus_irq_restore+0x26/0x30 > Modules linked in: hid_generic usbhid hid > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > Call Trace: > kvm_wait+0x76/0x90 > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > do_raw_spin_lock+0xc9/0xd0 > _raw_spin_lock+0x59/0x70 > lockref_get_not_dead+0xf/0x50 > __legitimize_path+0x31/0x60 > legitimize_root+0x37/0x50 > try_to_unlazy_next+0x7f/0x1d0 > lookup_fast+0xb0/0x170 > path_openat+0x165/0x9b0 > do_filp_open+0x99/0x110 > do_sys_openat2+0x1f1/0x2e0 > do_sys_open+0x5c/0x80 > __x64_sys_open+0x21/0x30 > do_syscall_64+0x32/0x50 > entry_SYSCALL_64_after_hwframe+0x44/0xae > > The irqflags handling in kvm_wait() which ends up doing: > > local_irq_save(flags); > safe_halt(); > local_irq_restore(flags); > > which triggered a new consistency checking, we generally expect > local_irq_save() and local_irq_restore() to be pared and sanely > nested, and so local_irq_restore() expects to be called with > irqs disabled. > > This patch fixes it by adding a local_irq_disable() after safe_halt() > to avoid this warning. > > Cc: Mark Rutland > Cc: Thomas Gleixner > Signed-off-by: Wanpeng Li > --- > arch/x86/kernel/kvm.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01..688c84a 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > */ > if (arch_irqs_disabled_flags(flags)) > halt(); > - else > + else { > safe_halt(); > + local_irq_disable(); > + } > > out: > local_irq_restore(flags); > -- > 2.7.4 >
Re: [PATCH v2] KVM: LAPIC: Advancing the timer expiration on guest initiated write
ping, :) On Thu, 4 Mar 2021 at 08:35, Wanpeng Li wrote: > > From: Wanpeng Li > > Advancing the timer expiration should only be necessary on guest initiated > writes. When we cancel the timer and clear .pending during state restore, > clear expired_tscdeadline as well. > > Reviewed-by: Sean Christopherson > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * update patch description > > arch/x86/kvm/lapic.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 45d40bf..f2b6e79 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct > kvm_lapic_state *s) > > apic_update_ppr(apic); > hrtimer_cancel(>lapic_timer.timer); > + apic->lapic_timer.expired_tscdeadline = 0; > apic_update_lvtt(apic); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > update_divide_count(apic); > -- > 2.7.4 >
Re: [PATCH v4] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
ping, :) On Wed, 24 Feb 2021 at 09:38, Wanpeng Li wrote: > > From: Wanpeng Li > > # lscpu > Architecture: x86_64 > CPU op-mode(s):32-bit, 64-bit > Byte Order:Little Endian > CPU(s):88 > On-line CPU(s) list: 0-63 > Off-line CPU(s) list: 64-87 > > # cat /proc/cmdline > BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro > rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 > no-kvmclock-vsyscall > > # echo 1 > /sys/devices/system/cpu/cpu76/online > -bash: echo: write error: Cannot allocate memory > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall > and sometimes you end up with vsyscall disabled if the host does something > strange. This patch fixes it by allocating this dynamically memory > unconditionally even if vsyscall is disabled. 
> > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > variables") > Reported-by: Zelin Deng > Cc: Brijesh Singh > Cc: sta...@vger.kernel.org#v4.19-rc5+ > Signed-off-by: Wanpeng Li > --- > v3 -> v4: > * fix kernel test robot report WARNING > v2 -> v3: > * allocate dynamically memory unconditionally > v1 -> v2: > * add code comments > > arch/x86/kernel/kvmclock.c | 19 +-- > 1 file changed, 9 insertions(+), 10 deletions(-) > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index aa59374..1fc0962 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) > > static int __init kvm_setup_vsyscall_timeinfo(void) > { > -#ifdef CONFIG_X86_64 > - u8 flags; > + kvmclock_init_mem(); > > - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) > - return 0; > +#ifdef CONFIG_X86_64 > + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { > + u8 flags; > > - flags = pvclock_read_flags(_clock_boot[0].pvti); > - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) > - return 0; > + flags = pvclock_read_flags(_clock_boot[0].pvti); > + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) > + return 0; > > - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; > + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; > + } > #endif > > - kvmclock_init_mem(); > - > return 0; > } > early_initcall(kvm_setup_vsyscall_timeinfo); > -- > 2.7.4 >
[PATCH v2] KVM: LAPIC: Advancing the timer expiration on guest initiated write
From: Wanpeng Li Advancing the timer expiration should only be necessary on guest initiated writes. When we cancel the timer and clear .pending during state restore, clear expired_tscdeadline as well. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- v1 -> v2: * update patch description arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bf..f2b6e79 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(>lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); -- 2.7.4
Re: [PATCH] KVM: LAPIC: Advancing the timer expiration on guest initiated write
On Wed, 3 Mar 2021 at 01:16, Sean Christopherson wrote: > > On Tue, Mar 02, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Advancing the timer expiration should only be necessary on guest initiated > > writes. Now, we cancel the timer, clear .pending and clear > > expired_tscdeadline > > at the same time during state restore. > > That last sentence is confusing. kvm_apic_set_state() already clears > .pending, > by way of __start_apic_timer(). I think what you mean is: > > When we cancel the timer and clear .pending during state restore, clear > expired_tscdeadline as well. Good statement. :) > > With that, > > Reviewed-by: Sean Christopherson > > > Side topic, I think there's a theoretical bug where KVM could inject a > spurious > timer interrupt. If KVM is using hrtimer, the hrtimer expires early due to an > overzealous timer_advance_ns, and the guest writes MSR_TSCDEADLINE after the > hrtimer expires but before the vCPU is kicked, then KVM will inject a spurious > timer IRQ since the premature expiration should have been canceled by the > guest's > WRMSR. > > It could also cause KVM to soft hang the guest if the new > lapic_timer.tscdeadline > is written before apic_timer_expired() captures it in expired_tscdeadline. In > that case, KVM will wait for the new deadline, which could be far in the > future. The hrtimer_cancel() before setting new lapic_timer.tscdeadline in kvm_set_lapic_tscdeadline_msr() will wait for the hrtimer callback function to finish. Could it solve this issue? Wanpeng
[PATCH] KVM: LAPIC: Advancing the timer expiration on guest initiated write
From: Wanpeng Li Advancing the timer expiration should only be necessary on guest initiated writes. Now, we cancel the timer, clear .pending and clear expired_tscdeadline at the same time during state restore. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bf..f2b6e79 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(>lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); -- 2.7.4
[PATCH] KVM: x86: hyper-v: Fix Hyper-V context null-ptr-deref
From: Wanpeng Li Reported by syzkaller: KASAN: null-ptr-deref in range [0x0140-0x0147] CPU: 1 PID: 8370 Comm: syz-executor859 Not tainted 5.11.0-syzkaller #0 RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 Call Trace: kvm_set_irq_routing+0x69b/0x940 arch/x86/kvm/../../../virt/kvm/irqchip.c:223 kvm_vm_ioctl+0x12d0/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3959 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Hyper-V context is lazily allocated until Hyper-V specific MSRs are accessed or SynIC is enabled. However, the syzkaller testcase sets irq routing table directly w/o enabling SynIC. This results in null-ptr-deref when accessing SynIC Hyper-V context. This patch fixes it. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=163342ccd0 Reported-by: syzbot+6987f3b2dbd9eda95...@syzkaller.appspotmail.com Fixes: 8f014550dfb1 ("KVM: x86: hyper-v: Make Hyper-V emulation enablement conditional") Signed-off-by: Wanpeng Li --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7d2dae9..58fa8c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -159,7 +159,7 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); - if (!vcpu) + if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; -- 2.7.4
Re: general protection fault in kvm_hv_irq_routing_update
On Fri, 26 Feb 2021 at 15:01, syzbot wrote: > > Hello, > > syzbot found the following issue on: > > HEAD commit:a99163e9 Merge tag 'devicetree-for-5.12' of git://git.kern.. > git tree: upstream > console output: https://syzkaller.appspot.com/x/log.txt?x=12d72682d0 > kernel config: https://syzkaller.appspot.com/x/.config?x=7a875029a795d230 > dashboard link: https://syzkaller.appspot.com/bug?extid=6987f3b2dbd9eda95f12 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=12faef12d0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=163342ccd0 > > The issue was bisected to: > > commit 8f014550dfb114cc7f42a517d20d2cf887a0b771 > Author: Vitaly Kuznetsov > Date: Tue Jan 26 13:48:14 2021 + > > KVM: x86: hyper-v: Make Hyper-V emulation enablement conditional > > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10df16a8d0 > final oops: https://syzkaller.appspot.com/x/report.txt?x=12df16a8d0 > console output: https://syzkaller.appspot.com/x/log.txt?x=14df16a8d0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+6987f3b2dbd9eda95...@syzkaller.appspotmail.com > Fixes: 8f014550dfb1 ("KVM: x86: hyper-v: Make Hyper-V emulation enablement > conditional") > > L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and > https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for > details. 
> general protection fault, probably for non-canonical address > 0xdc28: [#1] PREEMPT SMP KASAN > KASAN: null-ptr-deref in range [0x0140-0x0147] > CPU: 1 PID: 8370 Comm: syz-executor859 Not tainted 5.11.0-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] > RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] > RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 > Code: 80 19 00 00 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 ff 01 00 00 4d 8b ad > 80 19 00 00 49 8d bd 40 01 00 00 48 89 f8 48 c1 e8 03 <0f> b6 04 28 84 c0 74 > 06 0f 8e d2 01 00 00 45 0f b6 bd 40 01 00 00 > RSP: 0018:c90001b3fac0 EFLAGS: 00010206 > RAX: 0028 RBX: 888012df5900 RCX: > RDX: 888022193780 RSI: 81174d43 RDI: 0140 > RBP: dc00 R08: R09: c900018819eb > R10: 81170f3e R11: R12: > R13: R14: R15: 0001 > FS: 00a73300() GS:8880b9d0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 557e8c876888 CR3: 13c0b000 CR4: 001526e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > kvm_set_irq_routing+0x69b/0x940 arch/x86/kvm/../../../virt/kvm/irqchip.c:223 > kvm_vm_ioctl+0x12d0/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3959 > vfs_ioctl fs/ioctl.c:48 [inline] > __do_sys_ioctl fs/ioctl.c:753 [inline] > __se_sys_ioctl fs/ioctl.c:739 [inline] > __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 > do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 > entry_SYSCALL_64_after_hwframe+0x44/0xae > RIP: 0033:0x43ef29 > Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 > 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 > RSP: 002b:7ffe391eb808 EFLAGS: 0246 ORIG_RAX: 0010 > RAX: ffda RBX: 00400488 RCX: 0043ef29 > RDX: 2140 RSI: 4008ae6a RDI: 0004 > RBP: 00402f10 R08: 00400488 R09: 00400488 > R10: 00400488 R11: 0246 R12: 00402fa0 > R13: R14: 004ac018 R15: 00400488 > 
Modules linked in: > ---[ end trace 2aa75ec1dd148710 ]--- > RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] Missing check to_hv_vcpu(vcpu) in synic_get(), I will fix it. > RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] > RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 > Code: 80 19 00 00 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 ff 01 00 00 4d 8b ad > 80 19 00 00 49 8d bd 40 01 00 00 48 89 f8 48 c1 e8 03 <0f> b6 04 28 84 c0 74 > 06 0f 8e d2 01 00 00 45 0f b6 bd 40 01 00 00 > RSP: 0018:c90001b3fac0 EFLAGS: 00010206 > RAX: 0028 RBX: 888012df5900 RCX: > RDX: 888022193780 RSI: 81174d43 RDI: 0140 > RBP: dc00 R08: R09: c900018819eb > R10: 81170f3e R11: R12: > R13: R14: R15: 0001 > FS: 00a73300() GS:8880b9d0()
[PATCH v4] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li # lscpu Architecture: x86_64 CPU op-mode(s):32-bit, 64-bit Byte Order:Little Endian CPU(s):88 On-line CPU(s) list: 0-63 Off-line CPU(s) list: 64-87 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall # echo 1 > /sys/devices/system/cpu/cpu76/online -bash: echo: write error: Cannot allocate memory The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall and sometimes you end up with vsyscall disabled if the host does something strange. This patch fixes it by allocating this dynamically memory unconditionally even if vsyscall is disabled. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v3 -> v4: * fix kernel test robot report WARNING v2 -> v3: * allocate dynamically memory unconditionally v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..1fc0962 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(_clock_boot[0].pvti); - if (!(flags & 
PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); -- 2.7.4
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > From: Wanpeng Li > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > guest > splatting below during boot: > > raw_local_irq_restore() called with IRQs enabled > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > warn_bogus_irq_restore+0x26/0x30 > Modules linked in: hid_generic usbhid hid > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > Call Trace: > kvm_wait+0x76/0x90 > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > do_raw_spin_lock+0xc9/0xd0 > _raw_spin_lock+0x59/0x70 > lockref_get_not_dead+0xf/0x50 > __legitimize_path+0x31/0x60 > legitimize_root+0x37/0x50 > try_to_unlazy_next+0x7f/0x1d0 > lookup_fast+0xb0/0x170 > path_openat+0x165/0x9b0 > do_filp_open+0x99/0x110 > do_sys_openat2+0x1f1/0x2e0 > do_sys_open+0x5c/0x80 > __x64_sys_open+0x21/0x30 > do_syscall_64+0x32/0x50 > entry_SYSCALL_64_after_hwframe+0x44/0xae > > The irqflags handling in kvm_wait() which ends up doing: > > local_irq_save(flags); > safe_halt(); > local_irq_restore(flags); > > which triggered a new consistency checking, we generally expect > local_irq_save() and local_irq_restore() to be pared and sanely > nested, and so local_irq_restore() expects to be called with > irqs disabled. > > This patch fixes it by adding a local_irq_disable() after safe_halt() > to avoid this warning. 
> > Cc: Mark Rutland > Cc: Thomas Gleixner > Signed-off-by: Wanpeng Li > --- > arch/x86/kernel/kvm.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01..688c84a 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > */ > if (arch_irqs_disabled_flags(flags)) > halt(); > - else > + else { > safe_halt(); > + local_irq_disable(); > + } An alternative fix: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..7127aef 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { -unsigned long flags; +bool disabled = irqs_disabled(); if (in_nmi()) return; -local_irq_save(flags); +if (!disabled) +local_irq_disable(); if (READ_ONCE(*ptr) != val) goto out; @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ -if (arch_irqs_disabled_flags(flags)) +if (disabled) halt(); else safe_halt(); out: -local_irq_restore(flags); +if (!disabled) +local_irq_enable(); } #ifdef CONFIG_X86_32
[PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by adding a local_irq_disable() after safe_halt() to avoid this warning. Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..688c84a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) */ if (arch_irqs_disabled_flags(flags)) halt(); - else + else { safe_halt(); + local_irq_disable(); + } out: local_irq_restore(flags); -- 2.7.4
[PATCH v3] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall and sometimes you end up with vsyscall disabled if the host does something strange. This patch fixes it by allocating this dynamically memory unconditionally even if vsyscall is disabled. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v2 -> v3: * allocate dynamically memory unconditionally v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..a72b16e 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,6 +268,8 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { + kvmclock_init_mem(); + #ifdef CONFIG_X86_64 u8 flags; @@ -281,8 +283,6 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); -- 2.7.4
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Wed, 27 Jan 2021 at 08:28, Wanpeng Li wrote: > > On Wed, 27 Jan 2021 at 01:26, Paolo Bonzini wrote: > > > > On 26/01/21 02:28, Wanpeng Li wrote: > > > ping, > > > On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > > >> > > >> From: Wanpeng Li > > >> > > >> The per-cpu vsyscall pvclock data pointer assigns either an element of > > >> the > > >> static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > > >> hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > > >> kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > > >> kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > > >> assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is > > >> not > > >> VDSO_CLOCKMODE_PVCLOCK. > > > > I am sorry, I still cannot figure out this patch. > > > > Is hotplug still broken if kvm vsyscall is enabled? > > Just when kvm vsyscall is disabled. :) > > # lscpu > Architecture: x86_64 > CPU op-mode(s):32-bit, 64-bit > Byte Order: Little Endian > CPU(s): 88 > On-line CPU(s) list: 0-63 > Off-line CPU(s) list: 64-87 > > # cat /proc/cmdline > BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root > ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US > .UTF-8 no-kvmclock-vsyscall > > # echo 1 > /sys/devices/system/cpu/cpu76/online > -bash: echo: write error: Cannot allocate memory The original bug report is here. https://bugzilla.kernel.org/show_bug.cgi?id=210213 Wanpeng
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Wed, 27 Jan 2021 at 01:26, Paolo Bonzini wrote: > > On 26/01/21 02:28, Wanpeng Li wrote: > > ping, > > On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > >> > >> From: Wanpeng Li > >> > >> The per-cpu vsyscall pvclock data pointer assigns either an element of the > >> static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > >> hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > >> kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > >> kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > >> assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > >> VDSO_CLOCKMODE_PVCLOCK. > > I am sorry, I still cannot figure out this patch. > > Is hotplug still broken if kvm vsyscall is enabled? Just when kvm vsyscall is disabled. :) # lscpu Architecture: x86_64 CPU op-mode(s):32-bit, 64-bit Byte Order: Little Endian CPU(s): 88 On-line CPU(s) list: 0-63 Off-line CPU(s) list: 64-87 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall # echo 1 > /sys/devices/system/cpu/cpu76/online -bash: echo: write error: Cannot allocate memory Wanpeng
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
ping, On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > > From: Wanpeng Li > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > VDSO_CLOCKMODE_PVCLOCK. > > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > variables") > Reported-by: Zelin Deng > Tested-by: Haiwei Li > Cc: Brijesh Singh > Cc: sta...@vger.kernel.org#v4.19-rc5+ > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * add code comments > > arch/x86/kernel/kvmclock.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index aa59374..01d4e55c 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -294,9 +294,11 @@ static int kvmclock_setup_percpu(unsigned int cpu) > /* > * The per cpu area setup replicates CPU0 data to all cpu > * pointers. So carefully check. CPU0 has been set up in init > -* already. > +* already. Assign vsyscall pvclock data pointer iff kvmclock > +* vsyscall is enabled. > */ > - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) > + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || > + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) > return 0; > > /* Use the static page for the first CPUs, allocate otherwise */ > -- > 2.7.4 >
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Fri, 15 Jan 2021 at 11:20, Wanpeng Li wrote: > > On Wed, 6 Jan 2021 at 08:51, Sean Christopherson wrote: > > > > +tglx > > > > On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote: > > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > > > > > After the introduction of the patch: > > > > > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > > > > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > > > enabling of irqs to process pending interrupts should not be required > > > within vcpu_enter_guest anymore. > > > > Ugh, except that commit completely broke tick-based accounting, on both > > Intel > > and AMD. With guest_exit_irqoff() being called immediately after VM-Exit, > > any > > tick that happens after IRQs are disabled will be accounted to the host. > > E.g. > > on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't > > processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has > > been > > cleared. > > > > This issue can be 100% reproduced. > https://bugzilla.kernel.org/show_bug.cgi?id=204177 Sorry, the posted link should be https://bugzilla.kernel.org/show_bug.cgi?id=209831 Wanpeng
Re: [PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Tue, 19 Jan 2021 at 02:27, Paolo Bonzini wrote: > > On 15/01/21 02:15, Wanpeng Li wrote: > >> The comment above should probably be updated as it is not clear why we > >> check kvm_clock.vdso_clock_mode here. Actually, I would even suggest we > >> introduce a 'kvmclock_tsc_stable' global instead to avoid this indirect > >> check. > > I prefer to update the comment above, assign vsyscall pvclock data > > pointer iff kvmclock vsyscall is enabled. > > Are you going to send v2? Yes. :) https://lore.kernel.org/kvm/1610960877-3110-1-git-send-email-wanpen...@tencent.com/ Wanpeng
[PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not VDSO_CLOCKMODE_PVCLOCK. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..01d4e55c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -294,9 +294,11 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* * The per cpu area setup replicates CPU0 data to all cpu * pointers. So carefully check. CPU0 has been set up in init -* already. +* already. Assign vsyscall pvclock data pointer iff kvmclock +* vsyscall is enabled. */ - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) return 0; /* Use the static page for the first CPUs, allocate otherwise */ -- 2.7.4
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Wed, 6 Jan 2021 at 08:51, Sean Christopherson wrote: > > +tglx > > On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote: > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > > > After the introduction of the patch: > > > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > > enabling of irqs to process pending interrupts should not be required > > within vcpu_enter_guest anymore. > > Ugh, except that commit completely broke tick-based accounting, on both Intel > and AMD. With guest_exit_irqoff() being called immediately after VM-Exit, any > tick that happens after IRQs are disabled will be accounted to the host. E.g. > on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't > processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been > cleared. > This issue can be 100% reproduced. https://bugzilla.kernel.org/show_bug.cgi?id=204177 > CONFIG_VIRT_CPU_ACCOUNTING_GEN=y should still work (I didn't bother to > verify). > > Thomas, any clever ideas? Handling IRQs in {vmx,svm}_vcpu_enter_exit() isn't > an > option as KVM hasn't restored enough state to handle an IRQ, e.g. PKRU and > XCR0 > are still guest values. Is it too heinous to fudge PF_VCPU across KVM's > "pending" IRQ handling? E.g. 
this god-awful hack fixes the accounting: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 836912b42030..5a777fd35b4b 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -9028,6 +9028,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > vcpu->mode = OUTSIDE_GUEST_MODE; > smp_wmb(); > > + current->flags |= PF_VCPU; > kvm_x86_ops.handle_exit_irqoff(vcpu); > > /* > @@ -9042,6 +9043,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > ++vcpu->stat.exits; > local_irq_disable(); > kvm_after_interrupt(vcpu); > + current->flags &= ~PF_VCPU; > > if (lapic_in_kernel(vcpu)) { > s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; > > > Conflicts: > > arch/x86/kvm/svm.c > > > > Signed-off-by: Nitesh Narayan Lal > > --- > > arch/x86/kvm/svm/svm.c | 9 + > > arch/x86/kvm/x86.c | 11 --- > > 2 files changed, 9 insertions(+), 11 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index cce0143a6f80..c9b2fbb32484 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, > > > > static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) > > { > > + kvm_before_interrupt(vcpu); > > + local_irq_enable(); > > + /* > > + * We must have an instruction with interrupts enabled, so > > + * the timer interrupt isn't delayed by the interrupt shadow. 
> > + */ > > + asm("nop"); > > + local_irq_disable(); > > + kvm_after_interrupt(vcpu); > > } > > > > static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 3f7c1fc7a3ce..3e17c9ffcad8 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > > > > kvm_x86_ops.handle_exit_irqoff(vcpu); > > > > - /* > > - * Consume any pending interrupts, including the possible source of > > - * VM-Exit on SVM and any ticks that occur between VM-Exit and now. > > - * An instruction is required after local_irq_enable() to fully > > unblock > > - * interrupts on processors that implement an interrupt shadow, the > > - * stat.exits increment will do nicely. > > - */ > > - kvm_before_interrupt(vcpu); > > - local_irq_enable(); > > ++vcpu->stat.exits; > > - local_irq_disable(); > > - kvm_after_interrupt(vcpu); > > > > if (lapic_in_kernel(vcpu)) { > > s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; > > -- > > 2.27.0 > >
Re: [PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Thu, 14 Jan 2021 at 21:45, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > > kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > > assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > > VDSO_CLOCKMODE_PVCLOCK. > > > > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > > variables") > > Reported-by: Zelin Deng > > Tested-by: Haiwei Li > > Cc: Brijesh Singh > > Cc: sta...@vger.kernel.org#v4.19-rc5+ > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kernel/kvmclock.c | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > > index aa59374..0624290 100644 > > --- a/arch/x86/kernel/kvmclock.c > > +++ b/arch/x86/kernel/kvmclock.c > > @@ -296,7 +296,8 @@ static int kvmclock_setup_percpu(unsigned int cpu) > >* pointers. So carefully check. CPU0 has been set up in init > >* already. > >*/ > > - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) > > + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || > > + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) > > return 0; > > The comment above should probably be updated as it is not clear why we > check kvm_clock.vdso_clock_mode here. Actually, I would even suggest we > introduce a 'kvmclock_tsc_stable' global instead to avoid this indirect > check. I prefer to update the comment above, assign vsyscall pvclock data pointer iff kvmclock vsyscall is enabled. Wanpeng
[PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not VDSO_CLOCKMODE_PVCLOCK. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvmclock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..0624290 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -296,7 +296,8 @@ static int kvmclock_setup_percpu(unsigned int cpu) * pointers. So carefully check. CPU0 has been set up in init * already. */ - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) return 0; /* Use the static page for the first CPUs, allocate otherwise */ -- 2.7.4
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Thu, 7 Jan 2021 at 17:35, Vitaly Kuznetsov wrote: > > Sean Christopherson writes: > > > On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote: > >> > >> Looking back, I don't quite understand why we wanted to account ticks > >> between vmexit and exiting guest context as 'guest' in the first place; > >> to my understanging 'guest time' is time spent within VMX non-root > >> operation, the rest is KVM overhead (system). > > > > With tick-based accounting, if the tick IRQ is received after PF_VCPU is > > cleared > > then that tick will be accounted to the host/system. The motivation for > > opening > > an IRQ window after VM-Exit is to handle the case where the guest is > > constantly > > exiting for a different reason _just_ before the tick arrives, e.g. if the > > guest > > has its tick configured such that the guest and host ticks get synchronized > > in a bad way. > > > > This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least > > with a > > stable TSC, as the accounting happens during guest_exit_irqoff() itself. > > Accounting might be less-than-stellar if TSC is unstable, but I don't think > > it > > would be as binary of a failure as tick-based accounting. > > > > Oh, yea, I vaguely remember we had to deal with a very similar problem > but for userspace/kernel accounting. It was possible to observe e.g. a > userspace task going 100% kernel while in reality it was just perfectly > synchronized with the tick and doing a syscall just before it arrives > (or something like that, I may be misremembering the details). Yes. :) commit 2a42eb9594a1 ("sched/cputime: Accumulate vtime on top of nsec clocksource") > So depending on the frequency, it is probably possible to e.g observe > '100% host' with tick based accounting, the guest just has to > synchronize exiting to KVM in a way that the tick will always arrive > past guest_exit_irqoff(). 
> > It seems to me this is a fundamental problem in case the frequency of > guest exits can match the frequency of the time accounting tick. > > -- > Vitaly >
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Wed, 6 Jan 2021 at 06:30, Nitesh Narayan Lal wrote: > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > After the introduction of the patch: > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > enabling of irqs to process pending interrupts should not be required > within vcpu_enter_guest anymore. > > Conflicts: > arch/x86/kvm/svm.c > > Signed-off-by: Nitesh Narayan Lal > --- > arch/x86/kvm/svm/svm.c | 9 + > arch/x86/kvm/x86.c | 11 --- > 2 files changed, 9 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index cce0143a6f80..c9b2fbb32484 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, > > static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) > { > + kvm_before_interrupt(vcpu); > + local_irq_enable(); > + /* > +* We must have an instruction with interrupts enabled, so > +* the timer interrupt isn't delayed by the interrupt shadow. > +*/ > + asm("nop"); > + local_irq_disable(); > + kvm_after_interrupt(vcpu); > } Why do we need to reintroduce this part? Wanpeng
Re: [PATCH] KVM: X86: Expose KVM_HINTS_REALTIME in KVM_GET_SUPPORTED_CPUID
On Thu, 22 Oct 2020 at 21:02, Paolo Bonzini wrote: > > On 22/10/20 03:34, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Per KVM_GET_SUPPORTED_CPUID ioctl documentation: > > > > This ioctl returns x86 cpuid features which are supported by both the > > hardware and kvm in its default configuration. > > > > A well-behaved userspace should not set the bit if it is not supported. > > > > Suggested-by: Jim Mattson > > Signed-off-by: Wanpeng Li > > It's common for userspace to copy all supported CPUID bits to > KVM_SET_CPUID2, I don't think this is the right behavior for > KVM_HINTS_REALTIME. > > (But maybe this was discussed already; if so, please point me to the > previous discussion). The discussion is here. :) https://www.spinics.net/lists/kvm/msg227265.html Wanpeng
[PATCH] KVM: X86: Expose KVM_HINTS_REALTIME in KVM_GET_SUPPORTED_CPUID
From: Wanpeng Li Per KVM_GET_SUPPORTED_CPUID ioctl documentation: This ioctl returns x86 cpuid features which are supported by both the hardware and kvm in its default configuration. A well-behaved userspace should not set the bit if it is not supported. Suggested-by: Jim Mattson Signed-off-by: Wanpeng Li --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a278b..225d251 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -789,7 +789,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = 0; entry->ecx = 0; - entry->edx = 0; + entry->edx = (1 << KVM_HINTS_REALTIME); break; case 0x8000: entry->eax = min(entry->eax, 0x801f); -- 2.7.4
Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance
Any comments? Paolo! :) On Wed, 9 Sep 2020 at 11:04, Wanpeng Li wrote: > > Any comments? guys! > On Tue, 1 Sep 2020 at 19:52, wrote: > > > > From: Yulei Zhang > > > > Currently in KVM memory virtulization we relay on mmu_lock to > > synchronize the memory mapping update, which make vCPUs work > > in serialize mode and slow down the execution, especially after > > migration to do substantial memory mapping will cause visible > > performance drop, and it can get worse if guest has more vCPU > > numbers and memories. > > > > The idea we present in this patch set is to mitigate the issue > > with pre-constructed memory mapping table. We will fast pin the > > guest memory to build up a global memory mapping table according > > to the guest memslots changes and apply it to cr3, so that after > > guest starts up all the vCPUs would be able to update the memory > > simultaneously without page fault exception, thus the performance > > improvement is expected. > > > > We use memory dirty pattern workload to test the initial patch > > set and get positive result even with huge page enabled. For example, > > we create guest with 32 vCPUs and 64G memories, and let the vcpus > > dirty the entire memory region concurrently, as the initial patch > > eliminate the overhead of mmu_lock, in 2M/1G huge page mode we would > > get the job done in about 50% faster. > > > > We only validate this feature on Intel x86 platform. And as Ben > > pointed out in RFC V1, so far we disable the SMM for resource > > consideration, drop the mmu notification as in this case the > > memory is pinned. > > > > V1->V2: > > * Rebase the code to kernel version 5.9.0-rc1. 
> > > > Yulei Zhang (9): > > Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT > > support > > Introduce page table population function for direct build EPT feature > > Introduce page table remove function for direct build EPT feature > > Add release function for direct build ept when guest VM exit > > Modify the page fault path to meet the direct build EPT requirement > > Apply the direct build EPT according to the memory slots change > > Add migration support when using direct build EPT > > Introduce kvm module parameter global_tdp to turn on the direct build > > EPT mode > > Handle certain mmu exposed functions properly while turn on direct > > build EPT mode > > > > arch/mips/kvm/mips.c| 13 + > > arch/powerpc/kvm/powerpc.c | 13 + > > arch/s390/kvm/kvm-s390.c| 13 + > > arch/x86/include/asm/kvm_host.h | 13 +- > > arch/x86/kvm/mmu/mmu.c | 533 ++-- > > arch/x86/kvm/svm/svm.c | 2 +- > > arch/x86/kvm/vmx/vmx.c | 7 +- > > arch/x86/kvm/x86.c | 55 ++-- > > include/linux/kvm_host.h| 7 +- > > virt/kvm/kvm_main.c | 43 ++- > > 10 files changed, 639 insertions(+), 60 deletions(-) > > > > -- > > 2.17.1 > >
Re: [PATCH] KVM: SVM: Add tracepoint for cr_interception
On Fri, 4 Sep 2020 at 19:29, Haiwei Li wrote: > > From: Haiwei Li > > Add trace_kvm_cr_write and trace_kvm_cr_read for svm. > > Signed-off-by: Haiwei Li Reviewed-by: Wanpeng Li
Re: [PATCH] KVM: x86: Add kvm_x86_ops hook to short circuit emulation
On Wed, 16 Sep 2020 at 07:29, Sean Christopherson wrote: > > Replace the existing kvm_x86_ops.need_emulation_on_page_fault() with a > more generic is_emulatable(), and unconditionally call the new function > in x86_emulate_instruction(). > > KVM will use the generic hook to support multiple security related > technologies that prevent emulation in one way or another. Similar to > the existing AMD #NPF case where emulation of the current instruction is > not possible due to lack of information, AMD's SEV-ES and Intel's SGX > and TDX will introduce scenarios where emulation is impossible due to > the guest's register state being inaccessible. And again similar to the > existing #NPF case, emulation can be initiated by kvm_mmu_page_fault(), > i.e. outside of the control of vendor-specific code. > > While the cause and architecturally visible behavior of the various > cases are different, e.g. SGX will inject a #UD, AMD #NPF is a clean > resume or complete shutdown, and SEV-ES and TDX "return" an error, the > impact on the common emulation code is identical: KVM must stop > emulation immediately and resume the guest. > > Query is_emulatable() in handle_ud() as well so that the > force_emulation_prefix code doesn't incorrectly modify RIP before > calling emulate_instruction() in the absurdly unlikely scenario that > KVM encounters forced emulation in conjunction with "do not emulate". 
> > Cc: Tom Lendacky > Signed-off-by: Sean Christopherson > --- > arch/x86/include/asm/kvm_host.h | 2 +- > arch/x86/kvm/mmu/mmu.c | 12 > arch/x86/kvm/svm/svm.c | 31 ++- > arch/x86/kvm/vmx/vmx.c | 12 ++-- > arch/x86/kvm/x86.c | 9 - > 5 files changed, 33 insertions(+), 33 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 5303dbc5c9bc..fa89511ed9d6 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1221,7 +1221,7 @@ struct kvm_x86_ops { > > int (*get_msr_feature)(struct kvm_msr_entry *entry); > > - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); > + bool (*is_emulatable)(struct kvm_vcpu *vcpu, void *insn, int > insn_len); > > bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); > int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index a5d0207e7189..f818a46db58c 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -5485,18 +5485,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t > cr2_or_gpa, u64 error_code, > if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && > !is_guest_mode(vcpu)) > emulation_type |= EMULTYPE_ALLOW_RETRY_PF; > emulate: > - /* > -* On AMD platforms, under certain conditions insn_len may be zero on > #NPF. > -* This can happen if a guest gets a page-fault on data access but > the HW > -* table walker is not able to read the instruction page (e.g > instruction > -* page is not present in memory). In those cases we simply restart > the > -* guest, with the exception of AMD Erratum 1096 which is > unrecoverable. 
> -*/ > - if (unlikely(insn && !insn_len)) { > - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) > - return 1; > - } > - > return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, >insn_len); > } > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index 03dd7bac8034..3a55495d985f 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -3933,19 +3933,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) > } > } > > -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) > +static bool svm_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int > insn_len) > { > - unsigned long cr4 = kvm_read_cr4(vcpu); > - bool smep = cr4 & X86_CR4_SMEP; > - bool smap = cr4 & X86_CR4_SMAP; > - bool is_user = svm_get_cpl(vcpu) == 3; > - > - /* > -* If RIP is invalid, go ahead with emulation which will cause an > -* internal error exit. > -*/ > - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) > - return true; > + bool smep, smap, is_user; > + unsigned long cr4; > > /* > * Detect and workaround Errata 1096 Fam_17h_00_0Fh. > @@ -3987,6 +3978,20 @@ static bool svm_need_emulation_on_page_fault(struct > kvm_vcpu *vcpu) > * instruction pointer so we will not able to workaround it. Lets > * print the error and request to kill the guest. > */ > + if (likely(!insn || insn_len)) > + return true; > + > + /* > +* If RIP is invalid, go ahead with
Re: [PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
On Sat, 12 Sep 2020 at 14:20, Paolo Bonzini wrote: > > On 09/09/20 10:47, Wanpeng Li wrote: > >> One more thing: > >> > >> VMX version does > >> > >> vmx_complete_interrupts(vmx); > >> if (is_guest_mode(vcpu)) > >> return EXIT_FASTPATH_NONE; > >> > >> and on SVM we analyze is_guest_mode() inside > >> svm_exit_handlers_fastpath() - should we also change that for > >> conformity? > > > > Agreed, will do in v2. > > Please just send an incremental patch. Thanks! Just sent out a patch to do it. :) Wanpeng
[PATCH] KVM: SVM: Analyze is_guest_mode() in svm_vcpu_run()
From: Wanpeng Li Analyze is_guest_mode() in svm_vcpu_run() instead of svm_exit_handlers_fastpath() in conformity with VMX version. Suggested-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3da5b2f..009035a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3393,8 +3393,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3580,6 +3579,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + exit_fastpath = svm_exit_handlers_fastpath(vcpu); return exit_fastpath; } -- 2.7.4
[PATCH v2 3/9] KVM: LAPIC: Fix updating DFR missing apic map recalculation
From: Wanpeng Li There is missing apic map recalculation after updating DFR, if it is INIT RESET, in x2apic mode, local apic is software enabled before. This patch fix it by introducing the function kvm_apic_set_dfr() to be called in INIT RESET handling path. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 33aab20..e446bdf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFF); - atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFF); + else ret = 1; break; @@ -2301,7 +2306,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xU); + kvm_apic_set_dfr(apic, 0xU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) -- 2.7.4
[PATCH v2 4/9] KVM: VMX: Don't freeze guest when event delivery causes an APIC-access exit
From: Wanpeng Li According to SDM 27.2.4, Event delivery causes an APIC-access VM exit. Don't report internal error and freeze guest when event delivery causes an APIC-access exit, it is handleable and the event will be re-injected during the next vmentry. Signed-off-by: Wanpeng Li --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 819c185..a544351 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6054,6 +6054,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; -- 2.7.4
[PATCH v2 8/9] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..dafc14d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH v2 9/9] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dafc14d..b3e3429 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,10 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3499,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3530,6 +3533,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); vmcb_mark_all_clean(svm->vmcb); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + exit_fastpath = svm_exit_handlers_fastpath(vcpu); + return exit_fastpath; } -- 2.7.4
[PATCH v2 5/9] KVM: LAPIC: Narrow down the kick target vCPU
From: Wanpeng Li The kick after setting KVM_REQ_PENDING_TIMER is used to handle the timer fires on a different pCPU which vCPU is running on, this kick is expensive since memory barrier, rcu, preemption disable/enable operations. We don't need this kick when injecting already-expired timer, we also should call out the VMX preemption timer case, which also passes from_timer_fn=false but doesn't need a kick because kvm_lapic_expired_hv_timer() is called from the target vCPU. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 6 -- arch/x86/kvm/x86.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e446bdf..3b32d3b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(>lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6cf..dcf4494 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1774,12 +1774,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab69..ea20b8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu 
*vcpu, struct msr_data *msr); -- 2.7.4
[PATCH v2 7/9] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH v2 6/9] KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns
From: Wanpeng Li All the checks in lapic_timer_int_injected(), __kvm_wait_lapic_expire(), and these function calls waste cpu cycles when the timer mode is not tscdeadline. We can observe ~1.3% world switch time overhead by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. This patch reduces the world switch latency caused by timer_advance_ns feature when the timer mode is not tscdeadline by simpling move the check against apic->lapic_timer.expired_tscdeadline much earlier. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 11 +-- arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c 
+++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if -- 2.7.4
[PATCH v2 2/9] KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting
From: Wanpeng Li Check apic_lvtt_tscdeadline() mode directly instead of apic_lvtt_oneshot() and apic_lvtt_period() to guarantee the timer is in tsc-deadline mode when wrmsr MSR_IA32_TSCDEADLINE. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 81bf6a8..33aab20 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2193,8 +2193,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(>lapic_timer.timer); -- 2.7.4
[PATCH v2 1/9] KVM: LAPIC: Return 0 when getting the tscdeadline timer if the lapic is hw disabled
From: Wanpeng Li Return 0 when getting the tscdeadline timer if the lapic is hw disabled. Suggested-by: Paolo Bonzini Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e..81bf6a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2183,8 +2183,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; -- 2.7.4
[PATCH v2 0/9] KVM: collect sporadic patches
Collect sporadic patches for easy apply. Wanpeng Li (9): KVM: LAPIC: Return 0 when getting the tscdeadline timer if the lapic is hw disabled KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting KVM: LAPIC: Fix updating DFR missing apic map recalculation KVM: VMX: Don't freeze guest when event delivery causes an APIC-access exit KVM: LAPIC: Narrow down the kick target vCPU KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff() KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run() KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts() arch/x86/kvm/lapic.c | 36 arch/x86/kvm/svm/svm.c | 17 + arch/x86/kvm/vmx/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 6 -- arch/x86/kvm/x86.h | 1 - 5 files changed, 31 insertions(+), 34 deletions(-) -- 2.7.4
Re: [PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
On Wed, 9 Sep 2020 at 16:36, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX > > and SVM with respect to completing interrupts. > > > > Suggested-by: Sean Christopherson > > Cc: Paul K. > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/svm/svm.c | 4 ++-- > > 1 file changed, 2 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index c61bc3b..74bcf0a 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, > > fastpath_t exit_fastpath) > > if (npt_enabled) > > vcpu->arch.cr3 = svm->vmcb->save.cr3; > > > > - svm_complete_interrupts(svm); > > - > > if (is_guest_mode(vcpu)) { > > int vmexit; > > > > @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct > > kvm_vcpu *vcpu) > >SVM_EXIT_EXCP_BASE + MC_VECTOR)) > > svm_handle_mce(svm); > > > > + svm_complete_interrupts(svm); > > + > > vmcb_mark_all_clean(svm->vmcb); > > return exit_fastpath; > > } > > This seems to be the right thing to do, however, the amount of code > between kvm_x86_ops.run() and kvm_x86_ops.handle_exit() is non-trivial, > hope it won't blow up in testing... > > Reviewed-by: Vitaly Kuznetsov > > One more thing: > > VMX version does > > vmx_complete_interrupts(vmx); > if (is_guest_mode(vcpu)) > return EXIT_FASTPATH_NONE; > > and on SVM we analyze is_guest_mode() inside > svm_exit_handlers_fastpath() - should we also change that for > conformity? Agreed, will do in v2. Wanpeng
Re: [PATCH 1/2] KVM: LAPIC: Fix updating DFR missing apic map recalculation
Any Reviewed-by for these two patches? :) On Wed, 19 Aug 2020 at 16:55, Wanpeng Li wrote: > > From: Wanpeng Li > > There is missing apic map recalculation after updating DFR, if it is > INIT RESET, in x2apic mode, local apic is software enabled before. > This patch fix it by introducing the function kvm_apic_set_dfr() to > be called in INIT RESET handling path. > > Signed-off-by: Wanpeng Li > --- > arch/x86/kvm/lapic.c | 15 ++- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 5ccbee7..248095a 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic > *apic, u32 id) > atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > } > > +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) > +{ > + kvm_lapic_set_reg(apic, APIC_DFR, val); > + atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > +} > + > static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) > { > return ((id >> 4) << 16) | (1 << (id & 0xf)); > @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 > reg, u32 val) > break; > > case APIC_DFR: > - if (!apic_x2apic_mode(apic)) { > - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFF); > - > atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > - } else > + if (!apic_x2apic_mode(apic)) > + kvm_apic_set_dfr(apic, val | 0x0FFF); > + else > ret = 1; > break; > > @@ -2303,7 +2308,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool > init_event) > SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > > - kvm_lapic_set_reg(apic, APIC_DFR, 0xU); > + kvm_apic_set_dfr(apic, 0xU); > apic_set_spiv(apic, 0xff); > kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); > if (!apic_x2apic_mode(apic)) > -- > 2.7.4 >
Re: [PATCH 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
On Wed, 9 Sep 2020 at 16:23, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > Analysis from Sean: > > > > | svm->next_rip is reset in svm_vcpu_run() only after calling > > | svm_exit_handlers_fastpath(), which will cause SVM's > > | skip_emulated_instruction() to write a stale RIP. > > > > This should only happen when svm->vmcb->control.next_rip is not set by > hardware as skip_emulated_instruction() itself sets 'svm->next_rip' > otherwise, right? The bug is reported here https://bugzilla.kernel.org/show_bug.cgi?id=209155 , the old machine which the reporter uses doesn't have NRIP save on #VMEXIT support. :) Wanpeng
Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance
Any comments, guys? On Tue, 1 Sep 2020 at 19:52, wrote: > > From: Yulei Zhang > > Currently in KVM memory virtualization we rely on mmu_lock to > synchronize the memory mapping update, which make vCPUs work > in serialize mode and slow down the execution, especially after > migration to do substantial memory mapping will cause visible > performance drop, and it can get worse if guest has more vCPU > numbers and memories. > > The idea we present in this patch set is to mitigate the issue > with pre-constructed memory mapping table. We will fast pin the > guest memory to build up a global memory mapping table according > to the guest memslots changes and apply it to cr3, so that after > guest starts up all the vCPUs would be able to update the memory > simultaneously without page fault exception, thus the performance > improvement is expected. > > We use memory dirty pattern workload to test the initial patch > set and get positive result even with huge page enabled. For example, > we create guest with 32 vCPUs and 64G memories, and let the vcpus > dirty the entire memory region concurrently, as the initial patch > eliminate the overhead of mmu_lock, in 2M/1G huge page mode we would > get the job done in about 50% faster. > > We only validate this feature on Intel x86 platform. And as Ben > pointed out in RFC V1, so far we disable the SMM for resource > consideration, drop the mmu notification as in this case the > memory is pinned. > > V1->V2: > * Rebase the code to kernel version 5.9.0-rc1. 
> > Yulei Zhang (9): > Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT > support > Introduce page table population function for direct build EPT feature > Introduce page table remove function for direct build EPT feature > Add release function for direct build ept when guest VM exit > Modify the page fault path to meet the direct build EPT requirement > Apply the direct build EPT according to the memory slots change > Add migration support when using direct build EPT > Introduce kvm module parameter global_tdp to turn on the direct build > EPT mode > Handle certain mmu exposed functions properly while turn on direct > build EPT mode > > arch/mips/kvm/mips.c| 13 + > arch/powerpc/kvm/powerpc.c | 13 + > arch/s390/kvm/kvm-s390.c| 13 + > arch/x86/include/asm/kvm_host.h | 13 +- > arch/x86/kvm/mmu/mmu.c | 533 ++-- > arch/x86/kvm/svm/svm.c | 2 +- > arch/x86/kvm/vmx/vmx.c | 7 +- > arch/x86/kvm/x86.c | 55 ++-- > include/linux/kvm_host.h| 7 +- > virt/kvm/kvm_main.c | 43 ++- > 10 files changed, 639 insertions(+), 60 deletions(-) > > -- > 2.17.1 >
[PATCH RESEND 3/3] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 74bcf0a..ac819f0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3500,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3529,6 +3533,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; -- 2.7.4
[PATCH RESEND 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..74bcf0a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH RESEND 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..74bcf0a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH 3/3] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 74bcf0a..ac819f0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3500,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3529,6 +3533,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; -- 2.7.4
[PATCH 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH v2] KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns
From: Wanpeng Li All the checks in lapic_timer_int_injected(), __kvm_wait_lapic_expire(), and these function calls waste cpu cycles when the timer mode is not tscdeadline. We can observe ~1.3% world switch time overhead by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. This patch reduces the world switch latency caused by timer_advance_ns feature when the timer mode is not tscdeadline by simpling move the check against apic->lapic_timer.expired_tscdeadline much earlier. Signed-off-by: Wanpeng Li --- v1 -> v2: * move the check against apic->lapic_timer.expired_tscdeadline much earlier instead of reset timer_advance_ns arch/x86/kvm/lapic.c | 11 +-- arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if -- 2.7.4
Re: [PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
On Thu, 3 Sep 2020 at 05:23, Sean Christopherson wrote: > > On Fri, Aug 28, 2020 at 09:35:08AM +0800, Wanpeng Li wrote: > > From: Wanpeng Li > > > > per-vCPU timer_advance_ns should be set to 0 if timer mode is not > > tscdeadline > > otherwise we waste cpu cycles in the function lapic_timer_int_injected(), > > especially on AMD platform which doesn't support tscdeadline mode. We can > > reset timer_advance_ns to the initial value if switch back to tscdealine > > timer mode. > > > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/lapic.c | 6 ++ > > 1 file changed, 6 insertions(+) > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > > index 654649b..abc296d 100644 > > --- a/arch/x86/kvm/lapic.c > > +++ b/arch/x86/kvm/lapic.c > > @@ -1499,10 +1499,16 @@ static void apic_update_lvtt(struct kvm_lapic *apic) > > kvm_lapic_set_reg(apic, APIC_TMICT, 0); > > apic->lapic_timer.period = 0; > > apic->lapic_timer.tscdeadline = 0; > > + if (timer_mode == APIC_LVT_TIMER_TSCDEADLINE && > > + lapic_timer_advance_dynamic) > > Bad indentation. > > > + apic->lapic_timer.timer_advance_ns = > > LAPIC_TIMER_ADVANCE_NS_INIT; > > Redoing the tuning seems odd. Doubt it will matter, but it feels weird to > have to retune the advancement just because the guest toggled between modes. > > Rather than clear timer_advance_ns, can we simply move the check against > apic->lapic_timer.expired_tscdeadline much earlier? I think that would > solve this performance hiccup, and IMO would be a logical change in any > case. E.g. 
with some refactoring to avoid more duplication between VMX and > SVM How about something like below: diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; -if (apic->lapic_timer.expired_tscdeadline == 0) -return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { -if (lapic_timer_int_injected(vcpu)) +if (lapic_in_kernel(vcpu) && +vcpu->arch.apic->lapic_timer.expired_tscdeadline && +vcpu->arch.apic->lapic_timer.timer_advance_ns && +lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { -if (apic->lapic_timer.timer_advance_ns) -__kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); -if (lapic_in_kernel(vcpu) && -vcpu->arch.apic->lapic_timer.timer_advance_ns) -kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t 
vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); -if (lapic_in_kernel(vcpu) && -vcpu->arch.apic->lapic_timer.timer_advance_ns) -kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if
Re: [PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
On Mon, 31 Aug 2020 at 20:48, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > per-vCPU timer_advance_ns should be set to 0 if timer mode is not > > tscdeadline > > otherwise we waste cpu cycles in the function lapic_timer_int_injected(), > > lapic_timer_int_injected is just a test, kvm_wait_lapic_expire() > (__kvm_wait_lapic_expire()) maybe? Both the check in lapic_timer_int_injected(), the check in __kvm_wait_lapic_expire(), and these function calls, we can observe ~1.3% world switch time reduction w/ this patch by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. In addition, I think we should set apic->lapic_timer.expired_tscdeadline to 0 when switching between tscdeadline mode and other modes on Intel in order that we will not waste cpu cycles to tune advance value in adjust_lapic_timer_advance() for one time. Wanpeng
[PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
From: Wanpeng Li per-vCPU timer_advance_ns should be set to 0 if timer mode is not tscdeadline, otherwise we waste cpu cycles in the function lapic_timer_int_injected(), especially on AMD platform which doesn't support tscdeadline mode. We can reset timer_advance_ns to the initial value if switching back to tscdeadline timer mode. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 654649b..abc296d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1499,10 +1499,16 @@ static void apic_update_lvtt(struct kvm_lapic *apic) kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; + if (timer_mode == APIC_LVT_TIMER_TSCDEADLINE && + lapic_timer_advance_dynamic) + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; } apic->lapic_timer.timer_mode = timer_mode; limit_periodic_timer_frequency(apic); } + if (timer_mode != APIC_LVT_TIMER_TSCDEADLINE && + lapic_timer_advance_dynamic) + apic->lapic_timer.timer_advance_ns = 0; } /* -- 2.7.4
Re: [PATCH v2] KVM: LAPIC: Narrow down the kick target vCPU
On Mon, 24 Aug 2020 at 09:03, Wanpeng Li wrote: > > From: Wanpeng Li > > The kick after setting KVM_REQ_PENDING_TIMER is used to handle the timer > fires on a different pCPU which vCPU is running on, this kick is expensive > since memory barrier, rcu, preemption disable/enable operations. We don't > need this kick when injecting already-expired timer, we also should call > out the VMX preemption timer case, which also passes from_timer_fn=false > but doesn't need a kick because kvm_lapic_expired_hv_timer() is called > from the target vCPU. > I missed Sean's Reviewed-by tag. Reviewed-by: Sean Christopherson > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * update patch subject and changelog > * open code kvm_set_pending_timer() > > arch/x86/kvm/lapic.c | 4 +++- > arch/x86/kvm/x86.c | 6 ------ > arch/x86/kvm/x86.h | 1 - > 3 files changed, 3 insertions(+), 8 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 248095a..97f1dbf 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, > bool from_timer_fn) > } > > atomic_inc(&apic->lapic_timer.pending); > - kvm_set_pending_timer(vcpu); > + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > + if (from_timer_fn) > + kvm_vcpu_kick(vcpu); > } > > static void start_sw_tscdeadline(struct kvm_lapic *apic) > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 599d732..51b74d0 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1778,12 +1778,6 @@ static s64 get_kvmclock_base_ns(void) > } > #endif > > -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) > -{ > - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > - kvm_vcpu_kick(vcpu); > -} > - > static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) > { > int version; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index 995ab69..ea20b8b 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -246,7 +246,6 @@ static inline bool
kvm_vcpu_latch_init(struct kvm_vcpu > *vcpu) > return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); > } > > -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); > void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int > inc_eip); > > void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); > -- > 2.7.4 >
Re: [PATCH 1/2] KVM: LAPIC: Fix updating DFR missing apic map recalculation
ping, :) On Wed, 19 Aug 2020 at 16:55, Wanpeng Li wrote: > > From: Wanpeng Li > > There is a missing apic map recalculation after updating DFR, if it is > INIT RESET, in x2apic mode, local apic is software enabled before. > This patch fixes it by introducing the function kvm_apic_set_dfr() to > be called in INIT RESET handling path. > > Signed-off-by: Wanpeng Li > --- > arch/x86/kvm/lapic.c | 15 ++++++++++----- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 5ccbee7..248095a 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic > *apic, u32 id) > atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > } > > +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) > +{ > + kvm_lapic_set_reg(apic, APIC_DFR, val); > + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > +} > + > static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) > { > return ((id >> 4) << 16) | (1 << (id & 0xf)); > @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 > reg, u32 val) > break; > > case APIC_DFR: > - if (!apic_x2apic_mode(apic)) { > - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); > - > atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > - } else > + if (!apic_x2apic_mode(apic)) > + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); > + else > ret = 1; > break; > > @@ -2303,7 +2308,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool > init_event) > SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > > - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); > + kvm_apic_set_dfr(apic, 0xffffffffU); > apic_set_spiv(apic, 0xff); > kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); > if (!apic_x2apic_mode(apic)) > -- > 2.7.4 >
Re: [PATCH v2 2/2] KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting
ping :) On Wed, 12 Aug 2020 at 14:30, Wanpeng Li wrote: > > From: Wanpeng Li > > Check apic_lvtt_tscdeadline() mode directly instead of apic_lvtt_oneshot() > and apic_lvtt_period() to guarantee the timer is in tsc-deadline mode when > wrmsr MSR_IA32_TSCDEADLINE. > > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * fix indentation > > arch/x86/kvm/lapic.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 79599af..abaf48e 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -2193,8 +2193,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu > *vcpu, u64 data) > { > struct kvm_lapic *apic = vcpu->arch.apic; > > - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || > - apic_lvtt_period(apic)) > + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) > return; > > hrtimer_cancel(&apic->lapic_timer.timer); > -- > 2.7.4 >
[PATCH v2] KVM: LAPIC: Narrow down the kick target vCPU
From: Wanpeng Li The kick after setting KVM_REQ_PENDING_TIMER is used to handle the case where the timer fires on a different pCPU from the one the vCPU is running on; this kick is expensive since it involves memory barrier, rcu, preemption disable/enable operations. We don't need this kick when injecting an already-expired timer; we also should call out the VMX preemption timer case, which also passes from_timer_fn=false but doesn't need a kick because kvm_lapic_expired_hv_timer() is called from the target vCPU. Signed-off-by: Wanpeng Li --- v1 -> v2: * update patch subject and changelog * open code kvm_set_pending_timer() arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 6 ------ arch/x86/kvm/x86.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 248095a..97f1dbf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 599d732..51b74d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1778,12 +1778,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab69..ea20b8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq,
int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); -- 2.7.4