Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 18:23, Paolo Bonzini wrote: > > On 20/04/21 10:48, Wanpeng Li wrote: > >> I was thinking of something simpler: > >> > >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > >> index 9b8e30dd5b9b..455c648f9adc 100644 > >> --- a/virt/kvm/kvm_main.c > >> +++ b/virt/kvm/kvm_main.c > >> @@ -3198,10 +3198,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >>{ > >> struct kvm *kvm = me->kvm; > >> struct kvm_vcpu *vcpu; > >> - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; > >> int yielded = 0; > >> int try = 3; > >> - int pass; > >> + int pass, num_passes = 1; > >> int i; > >> > >> kvm_vcpu_set_in_spin_loop(me, true); > >> @@ -3212,13 +3211,14 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >> * VCPU is holding the lock that we need and will release it. > >> * We approximate round-robin by starting at the last boosted > >> VCPU. > >> */ > >> - for (pass = 0; pass < 2 && !yielded && try; pass++) { > >> - kvm_for_each_vcpu(i, vcpu, kvm) { > >> - if (!pass && i <= last_boosted_vcpu) { > >> - i = last_boosted_vcpu; > >> - continue; > >> - } else if (pass && i > last_boosted_vcpu) > >> - break; > >> + for (pass = 0; pass < num_passes; pass++) { > >> + int idx = me->kvm->last_boosted_vcpu; > >> + int n = atomic_read(>online_vcpus); > >> + for (i = 0; i < n; i++, idx++) { > >> + if (idx == n) > >> + idx = 0; > >> + > >> + vcpu = kvm_get_vcpu(kvm, idx); > >> if (!READ_ONCE(vcpu->ready)) > >> continue; > >> if (vcpu == me) > >> @@ -3226,23 +3226,36 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > >> yield_to_kernel_mode) > >> if (rcuwait_active(>wait) && > >> !vcpu_dy_runnable(vcpu)) > >> continue; > >> - if (READ_ONCE(vcpu->preempted) && > >> yield_to_kernel_mode && > >> - !kvm_arch_vcpu_in_kernel(vcpu)) > >> - continue; > >> if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) > >> continue; > >> > >> + if (READ_ONCE(vcpu->preempted) && > >> yield_to_kernel_mode && > >> + 
!kvm_arch_vcpu_in_kernel(vcpu)) { > >> + /* > >> +* A vCPU running in userspace can get to > >> kernel mode via > >> +* an interrupt. That's a worse choice than a > >> CPU already > >> +* in kernel mode so only do it on a second > >> pass. > >> +*/ > >> + if (!vcpu_dy_runnable(vcpu)) > >> + continue; > >> + if (pass == 0) { > >> + num_passes = 2; > >> + continue; > >> + } > >> + } > >> + > >> yielded = kvm_vcpu_yield_to(vcpu); > >> if (yielded > 0) { > >> kvm->last_boosted_vcpu = i; > >> - break; > >> + goto done; > >> } else if (yielded < 0) { > >> try--; > >> if (!try) > >> - break; > >> + goto done; > >> } > >> } > >> } > >> +done: > > > > We just tested the above post against 96 vCPUs VM in an over-subscribe > > scenario, the score of pbzip2 fluctuated drastically. Sometimes it is > > worse than vanilla, but the average improvement is around 2.2%. The > > new version of my post is around 9.3%,the origial posted patch is > > around 10% which is totally as expected since now both IPI receivers > > in user-mode and lock-waiters are second class citizens. > > Fair enough. Of the two patches you posted I prefer the original, so > I'll go with that one. Great! Thanks. :) Wanpeng
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 15:23, Paolo Bonzini wrote: > > On 20/04/21 08:08, Wanpeng Li wrote: > > On Tue, 20 Apr 2021 at 14:02, Wanpeng Li wrote: > >> > >> On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > >>> > >>> On 19/04/21 18:32, Sean Christopherson wrote: > >>>> If false positives are a big concern, what about adding another pass to > >>>> the loop > >>>> and only yielding to usermode vCPUs with interrupts in the second full > >>>> pass? > >>>> I.e. give vCPUs that are already in kernel mode priority, and only yield > >>>> to > >>>> handle an interrupt if there are no vCPUs in kernel mode. > >>>> > >>>> kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good > >>>> thing. > >>> > >>> pv_unhalted won't help if you're waiting for a kernel spinlock though, > >>> would it? Doing two passes (or looking for a "best" candidate that > >>> prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > >>> seems like the best choice overall. > >> > >> How about something like this: > > I was thinking of something simpler: > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 9b8e30dd5b9b..455c648f9adc 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3198,10 +3198,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > { > struct kvm *kvm = me->kvm; > struct kvm_vcpu *vcpu; > - int last_boosted_vcpu = me->kvm->last_boosted_vcpu; > int yielded = 0; > int try = 3; > - int pass; > + int pass, num_passes = 1; > int i; > > kvm_vcpu_set_in_spin_loop(me, true); > @@ -3212,13 +3211,14 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > * VCPU is holding the lock that we need and will release it. > * We approximate round-robin by starting at the last boosted VCPU. 
> */ > - for (pass = 0; pass < 2 && !yielded && try; pass++) { > - kvm_for_each_vcpu(i, vcpu, kvm) { > - if (!pass && i <= last_boosted_vcpu) { > - i = last_boosted_vcpu; > - continue; > - } else if (pass && i > last_boosted_vcpu) > - break; > + for (pass = 0; pass < num_passes; pass++) { > + int idx = me->kvm->last_boosted_vcpu; > + int n = atomic_read(>online_vcpus); > + for (i = 0; i < n; i++, idx++) { > + if (idx == n) > + idx = 0; > + > + vcpu = kvm_get_vcpu(kvm, idx); > if (!READ_ONCE(vcpu->ready)) > continue; > if (vcpu == me) > @@ -3226,23 +3226,36 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool > yield_to_kernel_mode) > if (rcuwait_active(>wait) && > !vcpu_dy_runnable(vcpu)) > continue; > - if (READ_ONCE(vcpu->preempted) && > yield_to_kernel_mode && > - !kvm_arch_vcpu_in_kernel(vcpu)) > - continue; > if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) > continue; > > + if (READ_ONCE(vcpu->preempted) && > yield_to_kernel_mode && > + !kvm_arch_vcpu_in_kernel(vcpu)) { > + /* > +* A vCPU running in userspace can get to kernel > mode via > +* an interrupt. That's a worse choice than a > CPU already > +* in kernel mode so only do it on a second pass. > +*/ > + if (!vcpu_dy_runnable(vcpu)) > + continue; > + if (pass == 0) { > + num_passes = 2; > + continue; > + } > + } > + > yielded = kvm_vcpu_yield_to(vcpu); > if (yielded > 0) { > kvm->last_boosted_vcpu = i; > -
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 14:02, Wanpeng Li wrote: > > On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > > > > On 19/04/21 18:32, Sean Christopherson wrote: > > > If false positives are a big concern, what about adding another pass to > > > the loop > > > and only yielding to usermode vCPUs with interrupts in the second full > > > pass? > > > I.e. give vCPUs that are already in kernel mode priority, and only yield > > > to > > > handle an interrupt if there are no vCPUs in kernel mode. > > > > > > kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good > > > thing. > > > > pv_unhalted won't help if you're waiting for a kernel spinlock though, > > would it? Doing two passes (or looking for a "best" candidate that > > prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > > seems like the best choice overall. > > How about something like this: Sorry, should be the codes below: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b4dd95..9bc5f87 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -325,10 +325,12 @@ struct kvm_vcpu { * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. + * pending_interrupt: set when a vcpu waiting for an interrupt * dy_eligible: indicates whether vcpu is eligible for directed yield. 
*/ struct { bool in_spin_loop; +bool pending_interrupt; bool dy_eligible; } spin_loop; #endif @@ -1427,6 +1429,12 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } + +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +vcpu->spin_loop.pending_interrupt = val; +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; @@ -1438,6 +1446,10 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 529cff1..bf6f1ec 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -410,6 +410,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(>blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); +kvm_vcpu_set_pending_interrupt(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; @@ -3079,14 +3080,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * - * (a) VCPU which has not done pl-exit or cpu relax intercepted recently - * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beginning and cleared at the end of interception/PLE handler. + * (a) VCPU which has not done pl-exit or cpu relax intercepted and is not + * waiting for an interrupt recently (preempted lock holder). The former + * one is indicated by @in_spin_loop, set at the beginning and cleared at + * the end of interception/PLE handler. 
The later one is indicated by + * @pending_interrupt, set when interrupt is delivering and cleared at + * the end of directed yield. * - * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get - * chance last time (mostly it has become eligible now since we have probably - * yielded to lockholder in last iteration. This is done by toggling - * @dy_eligible each time a VCPU checked for eligibility.) + * (b) VCPU which has done pl-exit/ cpu relax intercepted or is waiting for + * interrupt but did not get chance last time (mostly it has become eligible + * now since we have probably yielded to lockholder in last iteration. This + * is done by toggling @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU @@ -3102,10 +3106,10 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; -eligible = !vcpu->spin_loop.in_spin_loop || +eligible = !(vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.pending_interrupt) || vcpu->spin_loop.dy_eligible; -if (vcpu->spin_loop.in_spin_loop) +if (vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.pending_interrupt) kv
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Tue, 20 Apr 2021 at 00:59, Paolo Bonzini wrote: > > On 19/04/21 18:32, Sean Christopherson wrote: > > If false positives are a big concern, what about adding another pass to the > > loop > > and only yielding to usermode vCPUs with interrupts in the second full pass? > > I.e. give vCPUs that are already in kernel mode priority, and only yield to > > handle an interrupt if there are no vCPUs in kernel mode. > > > > kvm_arch_dy_runnable() pulls in pv_unhalted, which seems like a good thing. > > pv_unhalted won't help if you're waiting for a kernel spinlock though, > would it? Doing two passes (or looking for a "best" candidate that > prefers kernel mode vCPUs to user mode vCPUs waiting for an interrupt) > seems like the best choice overall. How about something like this: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6b4dd95..8ba50be 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -325,10 +325,12 @@ struct kvm_vcpu { * Cpu relax intercept or pause loop exit optimization * in_spin_loop: set when a vcpu does a pause loop exit * or cpu relax intercepted. + * pending_interrupt: set when a vcpu waiting for an interrupt * dy_eligible: indicates whether vcpu is eligible for directed yield. 
*/ struct { bool in_spin_loop; +bool pending_interrupt; bool dy_eligible; } spin_loop; #endif @@ -1427,6 +1429,12 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.in_spin_loop = val; } + +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +vcpu->spin_loop.pending__interrupt = val; +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { vcpu->spin_loop.dy_eligible = val; @@ -1438,6 +1446,10 @@ static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) { } +static inline void kvm_vcpu_set_pending_interrupt(struct kvm_vcpu *vcpu, bool val) +{ +} + static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 529cff1..42e0255 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -410,6 +410,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) INIT_LIST_HEAD(>blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); +kvm_vcpu_set_pending_interrupt(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; @@ -3079,14 +3080,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); * Helper that checks whether a VCPU is eligible for directed yield. * Most eligible candidate to yield is decided by following heuristics: * - * (a) VCPU which has not done pl-exit or cpu relax intercepted recently - * (preempted lock holder), indicated by @in_spin_loop. - * Set at the beginning and cleared at the end of interception/PLE handler. + * (a) VCPU which has not done pl-exit or cpu relax intercepted and is not + * waiting for an interrupt recently (preempted lock holder). The former + * one is indicated by @in_spin_loop, set at the beginning and cleared at + * the end of interception/PLE handler. 
The later one is indicated by + * @pending_interrupt, set when interrupt is delivering and cleared at + * the end of directed yield. * - * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get - * chance last time (mostly it has become eligible now since we have probably - * yielded to lockholder in last iteration. This is done by toggling - * @dy_eligible each time a VCPU checked for eligibility.) + * (b) VCPU which has done pl-exit/ cpu relax intercepted or is waiting for + * interrupt but did not get chance last time (mostly it has become eligible + * now since we have probably yielded to lockholder in last iteration. This + * is done by toggling @dy_eligible each time a VCPU checked for eligibility.) * * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding * to preempted lock-holder could result in wrong VCPU selection and CPU @@ -3102,10 +3106,10 @@ static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT bool eligible; -eligible = !vcpu->spin_loop.in_spin_loop || +eligible = !(vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.has_interrupt) || vcpu->spin_loop.dy_eligible; -if (vcpu->spin_loop.in_spin_loop) +if (vcpu->spin_loop.in_spin_loop || vcpu->spin_loop.has_interrupt) kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); return eligible; @@ -3137,6 +3141,16 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +static bool kvm_has_interrupt_delivery(struct kvm_vcpu *vcpu) +{ +if (vcpu_dy_runnable(vcpu)) { +kvm_vcpu_set_pending_interrupt(vcpu, true); +return true;
Re: [PATCH v2 09/10] KVM: Don't take mmu_lock for range invalidation unless necessary
On Fri, 2 Apr 2021 at 08:59, Sean Christopherson wrote: > > Avoid taking mmu_lock for unrelated .invalidate_range_{start,end}() > notifications. Because mmu_notifier_count must be modified while holding > mmu_lock for write, and must always be paired across start->end to stay > balanced, lock elision must happen in both or none. To meet that > requirement, add a rwsem to prevent memslot updates across range_start() > and range_end(). > > Use a rwsem instead of a rwlock since most notifiers _allow_ blocking, > and the lock will be endl across the entire start() ... end() sequence. > If anything in the sequence sleeps, including the caller or a different > notifier, holding the spinlock would be disastrous. > > For notifiers that _disallow_ blocking, e.g. OOM reaping, simply go down > the slow path of unconditionally acquiring mmu_lock. The sane > alternative would be to try to acquire the lock and force the notifier > to retry on failure. But since OOM is currently the _only_ scenario > where blocking is disallowed attempting to optimize a guest that has been > marked for death is pointless. > > Unconditionally define and use mmu_notifier_slots_lock in the memslots > code, purely to avoid more #ifdefs. The overhead of acquiring the lock > is negligible when the lock is uncontested, which will always be the case > when the MMU notifiers are not used. > > Note, technically flag-only memslot updates could be allowed in parallel, > but stalling a memslot update for a relatively short amount of time is > not a scalability issue, and this is all more than complex enough. > > Based heavily on code from Ben Gardon. 
> > Suggested-by: Ben Gardon > Signed-off-by: Sean Christopherson I saw this splatting: == WARNING: possible circular locking dependency detected 5.12.0-rc3+ #6 Tainted: G OE -- qemu-system-x86/3069 is trying to acquire lock: 9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190 but task is already holding lock: aff7410a9160 (>mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (>mmu_notifier_slots_lock){.+.+}-{3:3}: down_read+0x48/0x250 kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] __mmu_notifier_invalidate_range_start+0xe8/0x260 wp_page_copy+0x82b/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 exc_page_fault+0x82/0x2b0 asm_exc_page_fault+0x1e/0x30 -> #0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}: __lock_acquire+0x110f/0x1980 lock_acquire+0x1bc/0x400 __mmu_notifier_invalidate_range_end+0x47/0x190 wp_page_copy+0x796/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 exc_page_fault+0x82/0x2b0 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0CPU1 lock(>mmu_notifier_slots_lock); lock(mmu_notifier_invalidate_range_start); lock(>mmu_notifier_slots_lock); lock(mmu_notifier_invalidate_range_start); *** DEADLOCK *** 2 locks held by qemu-system-x86/3069: #0: 9e4269f8a9e0 (>mmap_lock#2){}-{3:3}, at: do_user_addr_fault+0x10e/0x690 #1: aff7410a9160 (>mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm] stack backtrace: CPU: 0 PID: 3069 Comm: qemu-system-x86 Tainted: G OE 5.12.0-rc3+ #6 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0x87/0xb7 
print_circular_bug.isra.39+0x1b4/0x210 check_noncircular+0x103/0x150 __lock_acquire+0x110f/0x1980 ? __lock_acquire+0x110f/0x1980 lock_acquire+0x1bc/0x400 ? __mmu_notifier_invalidate_range_end+0x5/0x190 ? find_held_lock+0x40/0xb0 __mmu_notifier_invalidate_range_end+0x47/0x190 ? __mmu_notifier_invalidate_range_end+0x5/0x190 wp_page_copy+0x796/0xa30 do_wp_page+0xde/0x420 __handle_mm_fault+0x935/0x1230 handle_mm_fault+0x179/0x420 do_user_addr_fault+0x1b3/0x690 ? rcu_read_lock_sched_held+0x4f/0x80 exc_page_fault+0x82/0x2b0 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x55f5bef2560f
Re: [PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
On Sat, 17 Apr 2021 at 21:09, Paolo Bonzini wrote: > > On 16/04/21 05:08, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Both lock holder vCPU and IPI receiver that has halted are condidate for > > boost. However, the PLE handler was originally designed to deal with the > > lock holder preemption problem. The Intel PLE occurs when the spinlock > > waiter is in kernel mode. This assumption doesn't hold for IPI receiver, > > they can be in either kernel or user mode. the vCPU candidate in user mode > > will not be boosted even if they should respond to IPIs. Some benchmarks > > like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most > > of the time they are running in user mode. It can lead to a large number > > of continuous PLE events because the IPI sender causes PLE events > > repeatedly until the receiver is scheduled while the receiver is not > > candidate for a boost. > > > > This patch boosts the vCPU candidiate in user mode which is delivery > > interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs > > VM in over-subscribe scenario (The host machine is 2 socket, 48 cores, > > 96 HTs Intel CLX box). There is no performance regression for other > > benchmarks like Unixbench spawn (most of the time contend read/write > > lock in kernel mode), ebizzy (most of the time contend read/write sem > > and TLB shoodtdown in kernel mode). > > > > +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) > > +{ > > + if (vcpu->arch.apicv_active && > > static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) > > + return true; > > + > > + return false; > > +} > > Can you reuse vcpu_dy_runnable instead of this new function? I have some concerns. For x86 arch, vcpu_dy_runnable() will add extra vCPU candidates by KVM_REQ_EVENT and async pf(which has already opportunistically made the guest do other stuff). 
For other arches, kvm_arch_dy_runnale() is equal to kvm_arch_vcpu_runnable() except powerpc which has too many events and is not conservative. In general, vcpu_dy_runnable() will loose the conditions and add more vCPU candidates. Wanpeng
[PATCH] KVM: Boost vCPU candidate in user mode which is delivering interrupt
From: Wanpeng Li Both lock holder vCPU and IPI receiver that has halted are condidate for boost. However, the PLE handler was originally designed to deal with the lock holder preemption problem. The Intel PLE occurs when the spinlock waiter is in kernel mode. This assumption doesn't hold for IPI receiver, they can be in either kernel or user mode. the vCPU candidate in user mode will not be boosted even if they should respond to IPIs. Some benchmarks like pbzip2, swaptions etc do the TLB shootdown in kernel mode and most of the time they are running in user mode. It can lead to a large number of continuous PLE events because the IPI sender causes PLE events repeatedly until the receiver is scheduled while the receiver is not candidate for a boost. This patch boosts the vCPU candidiate in user mode which is delivery interrupt. We can observe the speed of pbzip2 improves 10% in 96 vCPUs VM in over-subscribe scenario (The host machine is 2 socket, 48 cores, 96 HTs Intel CLX box). There is no performance regression for other benchmarks like Unixbench spawn (most of the time contend read/write lock in kernel mode), ebizzy (most of the time contend read/write sem and TLB shoodtdown in kernel mode). 
Signed-off-by: Wanpeng Li --- arch/x86/kvm/x86.c | 8 include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 6 ++ 3 files changed, 15 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d2dd3f..0f16fa5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11069,6 +11069,14 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) return false; } +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + return true; + + return false; +} + bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu->arch.preempted_in_kernel; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b06d12..5012fc4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -954,6 +954,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); +bool kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a481e7..781d2db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3012,6 +3012,11 @@ static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) return false; } +bool __weak kvm_arch_interrupt_delivery(struct kvm_vcpu *vcpu) +{ + return false; +} + void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) { struct kvm *kvm = me->kvm; @@ -3045,6 +3050,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && + !kvm_arch_interrupt_delivery(vcpu) && !kvm_arch_vcpu_in_kernel(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) -- 2.7.4
Re: [PATCH v2 0/3] KVM: Properly account for guest CPU time
On Thu, 15 Apr 2021 at 08:49, Sean Christopherson wrote: > > On Wed, Apr 14, 2021, Wanpeng Li wrote: > > On Wed, 14 Apr 2021 at 01:25, Sean Christopherson wrote: > > > > > > On Tue, Apr 13, 2021, Wanpeng Li wrote: > > > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > > > reported that the guest time remains 0 when running a while true > > > > loop in the guest. > > > > > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > > > tick-based time accouting when the ticks that happen after IRQs are > > > > disabled are incorrectly accounted to the host/system time. This is > > > > because we exit the guest state too early. > > > > > > > > This patchset splits both context tracking logic and the time accounting > > > > logic from guest_enter/exit_irqoff(), keep context tracking around the > > > > actual vmentry/exit code, have the virt time specific helpers which > > > > can be placed at the proper spots in kvm. In addition, it will not > > > > break the world outside of x86. > > > > > > IMO, this is going in the wrong direction. Rather than separate context > > > tracking, > > > vtime accounting, and KVM logic, this further intertwines the three. > > > E.g. the > > > context tracking code has even more vtime accounting NATIVE vs. GEN vs. > > > TICK > > > logic baked into it. > > > > > > Rather than smush everything into context_tracking.h, I think we can > > > cleanly > > > split the context tracking and vtime accounting code into separate > > > pieces, which > > > will in turn allow moving the wrapping logic to linux/kvm_host.h. Once > > > that is > > > done, splitting the context tracking and time accounting logic for KVM x86 > > > becomes a KVM detail as opposed to requiring dedicated logic in the > > > context > > > tracking code. > > > > > > I have untested code that compiles on x86, I'll send an RFC shortly. 
> > > > We need an easy to backport fix and then we might have some further > > cleanups on top. > > I fiddled with this a bit today, I think I have something workable that will > be > a relatively clean and short backport. With luck, I'll get it posted > tomorrow. I think we should improve my posted version instead of posting a lot of alternative versions to save everybody's time. Wanpeng
Re: [PATCH v2 0/3] KVM: Properly account for guest CPU time
On Wed, 14 Apr 2021 at 01:25, Sean Christopherson wrote: > > On Tue, Apr 13, 2021, Wanpeng Li wrote: > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > reported that the guest time remains 0 when running a while true > > loop in the guest. > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > tick-based time accouting when the ticks that happen after IRQs are > > disabled are incorrectly accounted to the host/system time. This is > > because we exit the guest state too early. > > > > This patchset splits both context tracking logic and the time accounting > > logic from guest_enter/exit_irqoff(), keep context tracking around the > > actual vmentry/exit code, have the virt time specific helpers which > > can be placed at the proper spots in kvm. In addition, it will not > > break the world outside of x86. > > IMO, this is going in the wrong direction. Rather than separate context > tracking, > vtime accounting, and KVM logic, this further intertwines the three. E.g. the > context tracking code has even more vtime accounting NATIVE vs. GEN vs. TICK > logic baked into it. > > Rather than smush everything into context_tracking.h, I think we can cleanly > split the context tracking and vtime accounting code into separate pieces, > which > will in turn allow moving the wrapping logic to linux/kvm_host.h. Once that > is > done, splitting the context tracking and time accounting logic for KVM x86 > becomes a KVM detail as opposed to requiring dedicated logic in the context > tracking code. > > I have untested code that compiles on x86, I'll send an RFC shortly. We need an easy to backport fix and then we might have some further cleanups on top. Wanpeng
Re: [PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
On Tue, 13 Apr 2021 at 15:48, Christian Borntraeger wrote: > > > > On 13.04.21 09:38, Wanpeng Li wrote: > > On Tue, 13 Apr 2021 at 15:35, Christian Borntraeger > > wrote: > >> > >> > >> > >> On 13.04.21 09:16, Wanpeng Li wrote: > >> [...] > >> > >>> @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) > >>>} > >>> > >>>#else > THis is the else part > > > >>> +static __always_inline void context_guest_enter_irqoff(void) > >>> +{ > >>> + instrumentation_begin(); > > 2nd on > >>> + rcu_virt_note_context_switch(smp_processor_id()); > >>> + instrumentation_end(); > 2nd off > >>> +} > >>> + > >>>static __always_inline void guest_enter_irqoff(void) > >>>{ > >>>/* > >>> @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) > >>>instrumentation_begin(); > > first on > >>>vtime_account_kernel(current); > >>>current->flags |= PF_VCPU; > >>> - rcu_virt_note_context_switch(smp_processor_id()); > >>>instrumentation_end(); > > first off > >>> + > >>> + context_guest_enter_irqoff(); > here we call the 2nd on and off. > >> > >> So we now do instrumentation_begin 2 times? > > > > Similar to context_guest_enter_irqoff() ifdef > > CONFIG_VIRT_CPU_ACCOUNTING_GEN. > > For the > ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN part > context_guest_enter_irqoff() > does not have instrumentation_begin/end. > > Or did I miss anything. I mean the if (!context_tracking_enabled_this_cpu()) part in the function context_guest_enter_irqoff() ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN. :) Wanpeng
Re: [PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
On Tue, 13 Apr 2021 at 15:35, Christian Borntraeger wrote: > > > > On 13.04.21 09:16, Wanpeng Li wrote: > [...] > > > @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) > > } > > > > #else > > +static __always_inline void context_guest_enter_irqoff(void) > > +{ > > + instrumentation_begin(); > > + rcu_virt_note_context_switch(smp_processor_id()); > > + instrumentation_end(); > > +} > > + > > static __always_inline void guest_enter_irqoff(void) > > { > > /* > > @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) > > instrumentation_begin(); > > vtime_account_kernel(current); > > current->flags |= PF_VCPU; > > - rcu_virt_note_context_switch(smp_processor_id()); > > instrumentation_end(); > > + > > + context_guest_enter_irqoff(); > > So we now do instrumentation_begin 2 times? Similar to context_guest_enter_irqoff() ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN. Wanpeng
[PATCH v2 3/3] x86/kvm: Fix vtime accounting
From: Wanpeng Li The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit, which breaks the tick-based time accounting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. Keep context tracking around the actual vmentry/exit code, the time accounting logic is separated out by a preceding patch from guest_enter/exit_irqoff(). Keep vtime-based time accounting around the actual vmentry/exit code when vtime_accounting_enabled_this_cpu() is true, leave PF_VCPU clearing after handle_exit_irqoff() and explicit IRQ window for tick-based time accounting. Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Cc: sta...@vger.kernel.org#v5.9-rc1+ Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 6 -- arch/x86/kvm/vmx/vmx.c | 6 -- arch/x86/kvm/x86.c | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 48b396f3..2a4e284 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3726,11 +3726,12 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * accordingly. */ instrumentation_begin(); + vtime_account_guest_enter(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); - guest_enter_irqoff(); + context_guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); if (sev_es_guest(vcpu->kvm)) { @@ -3758,10 +3759,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * into world and some more. 
*/ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); + __vtime_account_guest_exit(); instrumentation_end(); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c05e6e2..23be956 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6613,11 +6613,12 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * accordingly. */ instrumentation_begin(); + vtime_account_guest_enter(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); - guest_enter_irqoff(); + context_guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); /* L1D Flush includes CPU buffer clear to mitigate MDS */ @@ -6647,10 +6648,11 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); + __vtime_account_guest_exit(); instrumentation_end(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce9a1d2..0d2dd3f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9245,6 +9245,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); + vcpu_account_guest_exit(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; -- 2.7.4
[PATCH v2 1/3] context_tracking: Split guest_enter/exit_irqoff
From: Wanpeng Li Split context_tracking part from guest_enter/exit_irqoff, it will be called separately in later patches. Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Signed-off-by: Wanpeng Li --- include/linux/context_tracking.h | 42 +--- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index bceb064..d8ad844 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -104,16 +104,8 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +static __always_inline void context_guest_enter_irqoff(void) { - instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; - instrumentation_end(); - if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_GUEST); @@ -131,10 +123,28 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +/* must be called with irqs disabled */ +static __always_inline void guest_enter_irqoff(void) +{ + instrumentation_begin(); + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; + instrumentation_end(); + + context_guest_enter_irqoff(); +} + +static __always_inline void context_guest_exit_irqoff(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_guest_exit_irqoff(); instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) @@ -145,6 +155,13 @@ static __always_inline void guest_exit_irqoff(void) } #else +static __always_inline void context_guest_enter_irqoff(void) +{ + instrumentation_begin(); + rcu_virt_note_context_switch(smp_processor_id()); + 
instrumentation_end(); +} + static __always_inline void guest_enter_irqoff(void) { /* @@ -155,10 +172,13 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_kernel(current); current->flags |= PF_VCPU; - rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); + + context_guest_enter_irqoff(); } +static __always_inline void context_guest_exit_irqoff(void) { } + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); -- 2.7.4
[PATCH v2 0/3] KVM: Properly account for guest CPU time
The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit, which breaks the tick-based time accounting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. This patchset splits both context tracking logic and the time accounting logic from guest_enter/exit_irqoff(), keep context tracking around the actual vmentry/exit code, have the virt time specific helpers which can be placed at the proper spots in kvm. In addition, it will not break the world outside of x86. v1 -> v2: * split context_tracking from guest_enter/exit_irqoff * provide separate vtime accounting functions for consistency * place the virt time specific helpers at the proper spot Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Wanpeng Li (3): context_tracking: Split guest_enter/exit_irqoff context_tracking: Provide separate vtime accounting functions x86/kvm: Fix vtime accounting arch/x86/kvm/svm/svm.c | 6 ++- arch/x86/kvm/vmx/vmx.c | 6 ++- arch/x86/kvm/x86.c | 1 + include/linux/context_tracking.h | 84 +++- 4 files changed, 74 insertions(+), 23 deletions(-) -- 2.7.4
[PATCH v2 2/3] context_tracking: Provide separate vtime accounting functions
From: Wanpeng Li Provide separate vtime accounting functions, because having proper wrappers for that case would be more consistent and less confusing. Suggested-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Sean Christopherson Cc: Michael Tokarev Signed-off-by: Wanpeng Li --- include/linux/context_tracking.h | 50 ++-- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d8ad844..491f889 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -102,6 +102,40 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ +static __always_inline void vtime_account_guest_enter(void) +{ + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN)) { + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; + } else { + vtime_account_kernel(current); + current->flags |= PF_VCPU; + } +} + +static __always_inline void __vtime_account_guest_exit(void) +{ + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN)) { + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_exit(current); + } else { + vtime_account_kernel(current); + } +} + +static __always_inline void vtime_account_guest_exit(void) +{ + __vtime_account_guest_exit(); + current->flags &= ~PF_VCPU; +} + +static __always_inline void vcpu_account_guest_exit(void) +{ + if (!vtime_accounting_enabled_this_cpu()) + current->flags &= ~PF_VCPU; +} #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static __always_inline void context_guest_enter_irqoff(void) @@ -127,10 +161,7 @@ static __always_inline void context_guest_enter_irqoff(void) static __always_inline void guest_enter_irqoff(void) { instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); context_guest_enter_irqoff(); @@ -147,10 
+178,7 @@ static __always_inline void guest_exit_irqoff(void) context_guest_exit_irqoff(); instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } @@ -170,8 +198,7 @@ static __always_inline void guest_enter_irqoff(void) * to flush. */ instrumentation_begin(); - vtime_account_kernel(current); - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); context_guest_enter_irqoff(); @@ -183,8 +210,7 @@ static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- 2.7.4
[PATCH v2 3/3] KVM: X86: Do not yield to self
From: Wanpeng Li If the target is self we do not need to yield, we can avoid malicious guest to play this. Signed-off-by: Wanpeng Li --- v1 -> v2: * update comments arch/x86/kvm/x86.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f08e9b4..ce9a1d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8231,6 +8231,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) if (!target || !READ_ONCE(target->ready)) goto no_yield; + /* Ignore requests to yield to self */ + if (vcpu == target) + goto no_yield; + if (kvm_vcpu_yield_to(target) <= 0) goto no_yield; -- 2.7.4
[PATCH v2 2/3] KVM: X86: Count attempted/successful directed yield
From: Wanpeng Li To analyze some performance issues with lock contention and scheduling, it is nice to know when directed yield are successful or failing. Signed-off-by: Wanpeng Li --- v1 -> v2: * rename new vcpu stat * account success instead of ignore arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 24 ++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44f8930..5af7411 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 nested_run; + u64 directed_yield_attempted; + u64 directed_yield_successful; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16fb395..f08e9b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VCPU_STAT("nested_run", nested_run), + VCPU_STAT("directed_yield_attempted", directed_yield_attempted), + VCPU_STAT("directed_yield_successful", directed_yield_successful), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -8211,21 +8213,31 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) } EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.directed_yield_attempted++; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); - if (target 
&& READ_ONCE(target->ready)) - kvm_vcpu_yield_to(target); + if (!target || !READ_ONCE(target->ready)) + goto no_yield; + + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + + vcpu->stat.directed_yield_successful++; + +no_yield: + return; } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -8272,7 +8284,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -8290,7 +8302,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) break; - kvm_sched_yield(vcpu->kvm, a0); + kvm_sched_yield(vcpu, a0); ret = 0; break; default: -- 2.7.4
[PATCH v2 1/3] x86/kvm: Don't bother __pv_cpu_mask when !CONFIG_SMP
From: Wanpeng Li Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move it inside CONFIG_SMP. In addition, we can avoid define and alloc __pv_cpu_mask when !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. Signed-off-by: Wanpeng Li --- v1 -> v2: * shuffle things around a bit more arch/x86/kernel/kvm.c | 118 +++--- 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..224a7a1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -451,6 +451,10 @@ static void __init sev_map_percpu_data(void) } } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +462,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +574,49 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* +* We have to call flush only on online vCPUs. 
And +* queue flush_on_enter for pre-empted vCPUs +*/ + for_each_cpu(cpu, flushmask) { + src = _cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(>preempted, , + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + +static __init int kvm_alloc_cpumask(void) +{ + int cpu; + + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported() || pv_ipi_supported()) + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + + return 0; +} +arch_initcall(kvm_alloc_cpumask); + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -611,33 +654,8 @@ static int kvm_cpu_down_prepare(unsigned int cpu) local_irq_enable(); return 0; } -#endif - -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); - - cpumask_copy(flushmask, cpumask); - /* -* We have to call flush only on online vCPUs. 
And -* queue flush_on_enter for pre-empted vCPUs -*/ - for_each_cpu(cpu, flushmask) { - src = _cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(>preempted, , - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } - native_flush_tlb_others(flushmask, info); -} +#endif static void __init kvm_guest_init(void) { @@ -653,12 +671,6 @@ static void __init kvm_guest_init(void) pv_ops.time.steal_clock = kvm_steal_clock; } - if (pv_tlb_flush_supported()) { - pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; - pr_info("KVM setup pv remote TLB flush\n"); - } - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -668,6 +680,12 @@ static void __init kvm_guest_init(void) } #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; @@ -734,7 +752,7 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_in
Re: [PATCH] KVM: X86: Do not yield to self
On Fri, 9 Apr 2021 at 00:56, Sean Christopherson wrote: > > On Thu, Apr 08, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > If the target is self we do not need to yield, we can avoid malicious > > guest to play this. > > > > Signed-off-by: Wanpeng Li > > --- > > Rebased on > > https://lore.kernel.org/kvm/1617697935-4158-1-git-send-email-wanpen...@tencent.com/ > > > > arch/x86/kvm/x86.c | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 43c9f9b..260650f 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -8230,6 +8230,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, > > unsigned long dest_id) > > if (!target) > > goto no_yield; > > > > + /* yield to self */ > > If you're going to bother with a comment, maybe elaborate a bit, e.g. > > /* Ignore requests to yield to self. */ Looks good, thanks. Wanpeng
Re: [PATCH] KVM: X86: Count success and invalid yields
On Fri, 9 Apr 2021 at 01:08, Sean Christopherson wrote: > > On Tue, Apr 06, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > To analyze some performance issues with lock contention and scheduling, > > it is nice to know when directed yield are successful or failing. > > > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/include/asm/kvm_host.h | 2 ++ > > arch/x86/kvm/x86.c | 26 -- > > 2 files changed, 22 insertions(+), 6 deletions(-) > > > > diff --git a/arch/x86/include/asm/kvm_host.h > > b/arch/x86/include/asm/kvm_host.h > > index 44f8930..157bcaa 100644 > > --- a/arch/x86/include/asm/kvm_host.h > > +++ b/arch/x86/include/asm/kvm_host.h > > @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { > > u64 halt_poll_success_ns; > > u64 halt_poll_fail_ns; > > u64 nested_run; > > + u64 yield_directed; > > + u64 yield_directed_ignore; > > }; > > > > struct x86_instruction_info; > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 16fb395..3b475cd 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > > VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), > > VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), > > VCPU_STAT("nested_run", nested_run), > > + VCPU_STAT("yield_directed", yield_directed), > > This is ambiguous, it's not clear without looking at the code if it's counting > attempts or actual yields. > > > + VCPU_STAT("yield_directed_ignore", yield_directed_ignore), > > "ignored" also feels a bit misleading, as that implies KVM deliberately > ignored > a valid request, whereas many of the failure paths are due to invalid requests > or errors of some kind. > > What about mirroring the halt poll stats, i.e. track "attempted" and > "successful", > as opposed to "attempted" and "ignored/failed".And maybe switched directed > and yield? I.e. directed_yield_attempted and directed_yield_successful. Good suggestion. 
> > Alternatively, would it make sense to do s/directed/pv, or is that not worth > the > potential risk of being wrong if a non-paravirt use case comes along? > > pv_yield_attempted > pv_yield_successful > > > VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), > > VM_STAT("mmu_pte_write", mmu_pte_write), > > VM_STAT("mmu_pde_zapped", mmu_pde_zapped), > > @@ -8211,21 +8213,33 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) > > } > > EXPORT_SYMBOL_GPL(kvm_apicv_init); > > > > -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) > > +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) > > { > > struct kvm_vcpu *target = NULL; > > struct kvm_apic_map *map; > > > > + vcpu->stat.yield_directed++; > > + > > rcu_read_lock(); > > - map = rcu_dereference(kvm->arch.apic_map); > > + map = rcu_dereference(vcpu->kvm->arch.apic_map); > > > > if (likely(map) && dest_id <= map->max_apic_id && > > map->phys_map[dest_id]) > > target = map->phys_map[dest_id]->vcpu; > > > > rcu_read_unlock(); > > + if (!target) > > + goto no_yield; > > + > > + if (!READ_ONCE(target->ready)) > > I vote to keep these checks together. That'll also make the addition of the > "don't yield to self" check match the order of ready vs. self in > kvm_vcpu_on_spin(). Do it in v2. Wanpeng
Re: [PATCH] x86/kvm: Don't alloc __pv_cpu_mask when !CONFIG_SMP
On Fri, 9 Apr 2021 at 04:20, Sean Christopherson wrote: > > On Wed, Apr 07, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move > > it inside CONFIG_SMP. In addition, we can avoid alloc __pv_cpu_mask when > > !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. > > ... > > > +static bool pv_tlb_flush_supported(void) { return false; } > > +static bool pv_ipi_supported(void) { return false; } > > +static void kvm_flush_tlb_others(const struct cpumask *cpumask, > > + const struct flush_tlb_info *info) { } > > +static void kvm_setup_pv_ipi(void) { } > > If you shuffle things around a bit more, you can avoid these stubs, and hide > the > definition of __pv_cpu_mask behind CONFIG_SMP, too. Thanks, I will move around. Wanpeng
[PATCH] KVM: X86: Do not yield to self
From: Wanpeng Li If the target is self we do not need to yield, we can avoid malicious guest to play this. Signed-off-by: Wanpeng Li --- Rebased on https://lore.kernel.org/kvm/1617697935-4158-1-git-send-email-wanpen...@tencent.com/ arch/x86/kvm/x86.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43c9f9b..260650f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8230,6 +8230,10 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) if (!target) goto no_yield; + /* yield to self */ + if (vcpu->vcpu_id == target->vcpu_id) + goto no_yield; + if (!READ_ONCE(target->ready)) goto no_yield; -- 2.7.4
Re: [PATCH v2 07/17] KVM: x86/mmu: Check PDPTRs before allocating PAE roots
On Fri, 5 Mar 2021 at 09:12, Sean Christopherson wrote: > > Check the validity of the PDPTRs before allocating any of the PAE roots, > otherwise a bad PDPTR will cause KVM to leak any previously allocated > roots. > > Signed-off-by: Sean Christopherson > --- > arch/x86/kvm/mmu/mmu.c | 20 ++-- > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 7ebfbc77b050..9fc2b46f8541 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -3269,7 +3269,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) > static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) > { > struct kvm_mmu *mmu = vcpu->arch.mmu; > - u64 pdptr, pm_mask; > + u64 pdptrs[4], pm_mask; > gfn_t root_gfn, root_pgd; > hpa_t root; > int i; > @@ -3280,6 +3280,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu > *vcpu) > if (mmu_check_root(vcpu, root_gfn)) > return 1; > > + if (mmu->root_level == PT32E_ROOT_LEVEL) { > + for (i = 0; i < 4; ++i) { > + pdptrs[i] = mmu->get_pdptr(vcpu, i); > + if (!(pdptrs[i] & PT_PRESENT_MASK)) > + continue; > + > + if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) > + return 1; > + } > + } > + I saw this splatting: BUG: sleeping function called from invalid context at arch/x86/kvm/kvm_cache_regs.h:115 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 3090, name: qemu-system-x86 3 locks held by qemu-system-x86/3090: #0: 8d499f8d00d0 (>mutex){+.+.}-{3:3}, at: kvm_vcpu_ioctl+0x8e/0x680 [kvm] #1: b1b540f873e8 (>srcu){}-{0:0}, at: vcpu_enter_guest+0xb30/0x1b10 [kvm] #2: b1b540f7d018 (&(kvm)->mmu_lock){+.+.}-{2:2}, at: kvm_mmu_load+0xb5/0x540 [kvm] Preemption disabled at: [] kvm_mmu_load+0xb5/0x540 [kvm] CPU: 2 PID: 3090 Comm: qemu-system-x86 Tainted: GW OE 5.12.0-rc3+ #3 Call Trace: dump_stack+0x87/0xb7 ___might_sleep+0x202/0x250 __might_sleep+0x4a/0x80 kvm_pdptr_read+0x20/0x60 [kvm] kvm_mmu_load+0x3bd/0x540 [kvm] vcpu_enter_guest+0x1297/0x1b10 [kvm] 
kvm_arch_vcpu_ioctl_run+0x372/0x690 [kvm] kvm_vcpu_ioctl+0x3ca/0x680 [kvm] __x64_sys_ioctl+0x27a/0x800 do_syscall_64+0x38/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae There is a might_sleep() in kvm_pdptr_read(), however, the original commit didn't explain more. I can send a formal one if the below fix is acceptable. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index efb41f3..f33026f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3289,17 +3289,24 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (mmu_check_root(vcpu, root_gfn)) return 1; +write_unlock(>kvm->mmu_lock); if (mmu->root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; -if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) +if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) { +write_lock(>kvm->mmu_lock); return 1; +} } } +write_lock(>kvm->mmu_lock); +if (make_mmu_pages_available(vcpu)) +return -ENOSPC; + /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root.
Re: [PATCH v2] KVM: Explicitly use GFP_KERNEL_ACCOUNT for 'struct kvm_vcpu' allocations
On Wed, 7 Apr 2021 at 03:07, Sean Christopherson wrote: > > Use GFP_KERNEL_ACCOUNT when allocating vCPUs to make it more obvious that > that the allocations are accounted, to make it easier to audit KVM's > allocations in the future, and to be consistent with other cache usage in > KVM. > > When using SLAB/SLUB, this is a nop as the cache itself is created with > SLAB_ACCOUNT. > > When using SLOB, there are caveats within caveats. SLOB doesn't honor > SLAB_ACCOUNT, so passing GFP_KERNEL_ACCOUNT will result in vCPU > allocations now being accounted. But, even that depends on internal > SLOB details as SLOB will only go to the page allocator when its cache is > depleted. That just happens to be extremely likely for vCPUs because the > size of kvm_vcpu is larger than the a page for almost all combinations of > architecture and page size. Whether or not the SLOB behavior is by > design is unknown; it's just as likely that no SLOB users care about > accounding and so no one has bothered to implemented support in SLOB. > Regardless, accounting vCPU allocations will not break SLOB+KVM+cgroup > users, if any exist. > > Cc: Wanpeng Li > Signed-off-by: Sean Christopherson Reviewed-by: Wanpeng Li > --- > > v2: Drop the Fixes tag and rewrite the changelog since this is a nop when > using SLUB or SLAB. [Wanpeng] > > virt/kvm/kvm_main.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 0a481e7780f0..580f98386b42 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3192,7 +3192,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > if (r) > goto vcpu_decrement; > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); > if (!vcpu) { > r = -ENOMEM; > goto vcpu_decrement; > -- > 2.31.0.208.g409f899ff0-goog >
[PATCH] x86/kvm: Don't alloc __pv_cpu_mask when !CONFIG_SMP
From: Wanpeng Li Enable PV TLB shootdown when !CONFIG_SMP doesn't make sense. Let's move it inside CONFIG_SMP. In addition, we can avoid alloc __pv_cpu_mask when !CONFIG_SMP and get rid of 'alloc' variable in kvm_alloc_cpumask. Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvm.c | 79 +-- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..202e1f0 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -451,6 +451,11 @@ static void __init sev_map_percpu_data(void) } } + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +#ifdef CONFIG_SMP + static bool pv_tlb_flush_supported(void) { return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && @@ -458,10 +463,6 @@ static bool pv_tlb_flush_supported(void) kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); } -static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); - -#ifdef CONFIG_SMP - static bool pv_ipi_supported(void) { return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); @@ -574,6 +575,32 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); + + cpumask_copy(flushmask, cpumask); + /* +* We have to call flush only on online vCPUs. 
And +* queue flush_on_enter for pre-empted vCPUs +*/ + for_each_cpu(cpu, flushmask) { + src = _cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(>preempted, , + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -611,33 +638,16 @@ static int kvm_cpu_down_prepare(unsigned int cpu) local_irq_enable(); return 0; } -#endif -static void kvm_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) -{ - u8 state; - int cpu; - struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); +#else - cpumask_copy(flushmask, cpumask); - /* -* We have to call flush only on online vCPUs. And -* queue flush_on_enter for pre-empted vCPUs -*/ - for_each_cpu(cpu, flushmask) { - src = _cpu(steal_time, cpu); - state = READ_ONCE(src->preempted); - if ((state & KVM_VCPU_PREEMPTED)) { - if (try_cmpxchg(>preempted, , - state | KVM_VCPU_FLUSH_TLB)) - __cpumask_clear_cpu(cpu, flushmask); - } - } +static bool pv_tlb_flush_supported(void) { return false; } +static bool pv_ipi_supported(void) { return false; } +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { } +static void kvm_setup_pv_ipi(void) { } - native_flush_tlb_others(flushmask, info); -} +#endif static void __init kvm_guest_init(void) { @@ -734,10 +744,8 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { -#if defined(CONFIG_SMP) if (pv_ipi_supported()) kvm_setup_pv_ipi(); -#endif } static bool __init kvm_msi_ext_dest_id(void) @@ -797,20 +805,11 @@ arch_initcall(activate_jump_labels); static __init int kvm_alloc_cpumask(void) { int cpu; - bool alloc = false; if (!kvm_para_available() || nopv) return 0; - if (pv_tlb_flush_supported()) - alloc = true; - -#if defined(CONFIG_SMP) - if 
(pv_ipi_supported()) - alloc = true; -#endif - - if (alloc) + if (pv_tlb_flush_supported() || pv_ipi_supported()) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); -- 2.7.4
[PATCH] KVM: X86: Count success and invalid yields
From: Wanpeng Li To analyze some performance issues with lock contention and scheduling, it is nice to know when directed yield are successful or failing. Signed-off-by: Wanpeng Li --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 26 -- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44f8930..157bcaa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,6 +1126,8 @@ struct kvm_vcpu_stat { u64 halt_poll_success_ns; u64 halt_poll_fail_ns; u64 nested_run; + u64 yield_directed; + u64 yield_directed_ignore; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16fb395..3b475cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VCPU_STAT("nested_run", nested_run), + VCPU_STAT("yield_directed", yield_directed), + VCPU_STAT("yield_directed_ignore", yield_directed_ignore), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pde_zapped", mmu_pde_zapped), @@ -8211,21 +8213,33 @@ void kvm_apicv_init(struct kvm *kvm, bool enable) } EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.yield_directed++; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); + if (!target) + goto no_yield; + + if (!READ_ONCE(target->ready)) + goto no_yield; - if (target && READ_ONCE(target->ready)) - 
kvm_vcpu_yield_to(target); + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + return; + +no_yield: + vcpu->stat.yield_directed_ignore++; + return; } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -8272,7 +8286,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -8290,7 +8304,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) break; - kvm_sched_yield(vcpu->kvm, a0); + kvm_sched_yield(vcpu, a0); ret = 0; break; default: -- 2.7.4
Re: [PATCH 1/2] KVM: Account memory allocations for 'struct kvm_vcpu'
On Wed, 31 Mar 2021 at 11:24, Sean Christopherson wrote: > > On Wed, Mar 31, 2021, Wanpeng Li wrote: > > On Wed, 31 Mar 2021 at 10:32, Sean Christopherson wrote: > > > > > > Use GFP_KERNEL_ACCOUNT for the vCPU allocations, the vCPUs are very much > > > tied to a single task/VM. For x86, the allocations were accounted up > > > until the allocation code was moved to common KVM. For all other > > > architectures, vCPU allocations were never previously accounted, but only > > > because most architectures lack accounting in general (for KVM). > > > > > > Fixes: e529ef66e6b5 ("KVM: Move vcpu alloc and init invocation to common > > > code") > > > Signed-off-by: Sean Christopherson > > > --- > > > virt/kvm/kvm_main.c | 2 +- > > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > index 383df23514b9..3884e9f30251 100644 > > > --- a/virt/kvm/kvm_main.c > > > +++ b/virt/kvm/kvm_main.c > > > @@ -3182,7 +3182,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm > > > *kvm, u32 id) > > > if (r) > > > goto vcpu_decrement; > > > > > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > > > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); > > > > kvm_vcpu_cache is created with SLAB_ACCOUNT flag in kvm_init(), this > > flag will guarantee further slab alloc will be charged to memcg. > > Please refer to memcg_slab_pre_alloc_hook(). So the patch is > > unnecessary. > > Hmm, I missed that. However, AFICT only SLAB/SLUB enforce SLAB_ACCOUNT, SLOB > does not appear to honor the flag. The caveat to SLOB is that the > GFP_KERNEL_ACCOUNT will only come into play when allocating new pages, and so > allocations smaller than a page will be accounted incorrectly (I think). > But, a vcpu is larger than a page (on x86), which means the vcpu allocation > will > always be correctly accounted. 
> > I've no idea if anyone actually uses KVM+SLOB, let alone cares about > accounting I asked maintainer Christoph in 2013, he told me "Well, I have never seen non experimental systems that use SLOB. Others have claimed they exist. It's mostly of academic interest." > in that case. But, it would be nice for KVM to be consistent with the > other > kmem_cache usage in KVM, all of which do double up on SLAB_ACCOUNT + > GFP_KERNEL_ACCOUNT. > > Maybe rewrite the changelog and drop the Fixes? Agreed. Wanpeng
Re: [PATCH 1/2] KVM: Account memory allocations for 'struct kvm_vcpu'
On Wed, 31 Mar 2021 at 10:32, Sean Christopherson wrote: > > Use GFP_KERNEL_ACCOUNT for the vCPU allocations, the vCPUs are very much > tied to a single task/VM. For x86, the allocations were accounted up > until the allocation code was moved to common KVM. For all other > architectures, vCPU allocations were never previously accounted, but only > because most architectures lack accounting in general (for KVM). > > Fixes: e529ef66e6b5 ("KVM: Move vcpu alloc and init invocation to common > code") > Signed-off-by: Sean Christopherson > --- > virt/kvm/kvm_main.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 383df23514b9..3884e9f30251 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -3182,7 +3182,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, > u32 id) > if (r) > goto vcpu_decrement; > > - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); > + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); kvm_vcpu_cache is created with SLAB_ACCOUNT flag in kvm_init(), this flag will guarantee further slab alloc will be charged to memcg. Please refer to memcg_slab_pre_alloc_hook(). So the patch is unnecessary. Wanpeng
Re: [PATCH 2/2] KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken
On Wed, 31 Mar 2021 at 01:01, Paolo Bonzini wrote: > > pvclock_gtod_sync_lock can be taken with interrupts disabled if the > preempt notifier calls get_kvmclock_ns to update the Xen > runstate information: > >spin_lock include/linux/spinlock.h:354 [inline] >get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587 >kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69 >kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100 >kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline] >kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062 > > So change the users of the spinlock to spin_lock_irqsave and > spin_unlock_irqrestore. > > Reported-by: syzbot+b282b65c2c68492df...@syzkaller.appspotmail.com > Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate > information") > Cc: David Woodhouse > Cc: Marcelo Tosatti > Signed-off-by: Paolo Bonzini Reviewed-by: Wanpeng Li > --- > arch/x86/kvm/x86.c | 25 ++--- > 1 file changed, 14 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 0a83eff40b43..2bfd00da465f 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2329,7 +2329,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, > u64 data) > kvm_vcpu_write_tsc_offset(vcpu, offset); > raw_spin_unlock_irqrestore(>arch.tsc_write_lock, flags); > > - spin_lock(>arch.pvclock_gtod_sync_lock); > + spin_lock_irqsave(>arch.pvclock_gtod_sync_lock, flags); > if (!matched) { > kvm->arch.nr_vcpus_matched_tsc = 0; > } else if (!already_matched) { > @@ -2337,7 +2337,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, > u64 data) > } > > kvm_track_tsc_matching(vcpu); > - spin_unlock(>arch.pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>arch.pvclock_gtod_sync_lock, flags); > } > > static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, > @@ -2559,15 +2559,16 @@ static void kvm_gen_update_masterclock(struct kvm > *kvm) > int i; > struct kvm_vcpu *vcpu; > struct kvm_arch *ka = >arch; > + 
unsigned long flags; > > kvm_hv_invalidate_tsc_page(kvm); > > kvm_make_mclock_inprogress_request(kvm); > > /* no guest entries from this point */ > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > pvclock_update_vm_gtod_copy(kvm); > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > @@ -2582,17 +2583,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) > { > struct kvm_arch *ka = >arch; > struct pvclock_vcpu_time_info hv_clock; > + unsigned long flags; > u64 ret; > > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > if (!ka->use_master_clock) { > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > return get_kvmclock_base_ns() + ka->kvmclock_offset; > } > > hv_clock.tsc_timestamp = ka->master_cycle_now; > hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > /* both __this_cpu_read() and rdtsc() should be on the same cpu */ > get_cpu(); > @@ -2686,13 +2688,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) > * If the host uses TSC clock, then passthrough TSC as stable > * to the guest. > */ > - spin_lock(>pvclock_gtod_sync_lock); > + spin_lock_irqsave(>pvclock_gtod_sync_lock, flags); > use_master_clock = ka->use_master_clock; > if (use_master_clock) { > host_tsc = ka->master_cycle_now; > kernel_ns = ka->master_kernel_ns; > } > - spin_unlock(>pvclock_gtod_sync_lock); > + spin_unlock_irqrestore(>pvclock_gtod_sync_lock, flags); > > /* Keep irq disabled to prevent changes to the clock */ > local_irq_save(flags); > @@ -7724,6 +7726,7 @@ static void kvm_hyperv_tsc_notifier(void) > struct kvm *kvm; > struct kvm_vcpu *vcpu; > int cpu; > + unsigned long flags; > >
Re: [PATCH 1/2] KVM: x86: reduce pvclock_gtod_sync_lock critical sections
On Wed, 31 Mar 2021 at 01:02, Paolo Bonzini wrote: > > There is no need to include changes to vcpu->requests into > the pvclock_gtod_sync_lock critical section. The changes to > the shared data structures (in pvclock_update_vm_gtod_copy) > already occur under the lock. > > Cc: David Woodhouse > Cc: Marcelo Tosatti > Signed-off-by: Paolo Bonzini Reviewed-by: Wanpeng Li > --- > arch/x86/kvm/x86.c | 10 -- > 1 file changed, 4 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index fe806e894212..0a83eff40b43 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2562,10 +2562,12 @@ static void kvm_gen_update_masterclock(struct kvm > *kvm) > > kvm_hv_invalidate_tsc_page(kvm); > > - spin_lock(>pvclock_gtod_sync_lock); > kvm_make_mclock_inprogress_request(kvm); > + > /* no guest entries from this point */ > + spin_lock(>pvclock_gtod_sync_lock); > pvclock_update_vm_gtod_copy(kvm); > + spin_unlock(>pvclock_gtod_sync_lock); > > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > @@ -2573,8 +2575,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > /* guest entries allowed */ > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); > - > - spin_unlock(>pvclock_gtod_sync_lock); > #endif > } > > @@ -7740,16 +7740,14 @@ static void kvm_hyperv_tsc_notifier(void) > struct kvm_arch *ka = >arch; > > spin_lock(>pvclock_gtod_sync_lock); > - > pvclock_update_vm_gtod_copy(kvm); > + spin_unlock(>pvclock_gtod_sync_lock); > > kvm_for_each_vcpu(cpu, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > > kvm_for_each_vcpu(cpu, vcpu, kvm) > kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); > - > - spin_unlock(>pvclock_gtod_sync_lock); > } > mutex_unlock(_lock); > } > -- > 2.26.2 > >
Re: [PATCH] KVM: X86: Properly account for guest CPU time when considering context tracking
On Tue, 30 Mar 2021 at 01:15, Sean Christopherson wrote: > > +Thomas > > On Mon, Mar 29, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > reported that the guest time remains 0 when running a while true > > loop in the guest. > > > > The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it > > belongs") moves guest_exit_irqoff() close to vmexit breaks the > > tick-based time accouting when the ticks that happen after IRQs are > > disabled are incorrectly accounted to the host/system time. This is > > because we exit the guest state too early. > > > > vtime-based time accounting is tied to context tracking, keep the > > guest_exit_irqoff() around vmexit code when both vtime-based time > > accounting and specific cpu is context tracking mode active. > > Otherwise, leave guest_exit_irqoff() after handle_exit_irqoff() > > and explicit IRQ window for tick-based time accouting. > > > > Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") > > Cc: Sean Christopherson > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/svm/svm.c | 3 ++- > > arch/x86/kvm/vmx/vmx.c | 3 ++- > > arch/x86/kvm/x86.c | 2 ++ > > 3 files changed, 6 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index 58a45bb..55fb5ce 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -3812,7 +3812,8 @@ static noinstr void svm_vcpu_enter_exit(struct > > kvm_vcpu *vcpu, > >* into world and some more. 
> >*/ > > lockdep_hardirqs_off(CALLER_ADDR0); > > - guest_exit_irqoff(); > > + if (vtime_accounting_enabled_this_cpu()) > > + guest_exit_irqoff(); > > > > instrumentation_begin(); > > trace_hardirqs_off_finish(); > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > > index 32cf828..85695b3 100644 > > --- a/arch/x86/kvm/vmx/vmx.c > > +++ b/arch/x86/kvm/vmx/vmx.c > > @@ -6689,7 +6689,8 @@ static noinstr void vmx_vcpu_enter_exit(struct > > kvm_vcpu *vcpu, > >* into world and some more. > >*/ > > lockdep_hardirqs_off(CALLER_ADDR0); > > - guest_exit_irqoff(); > > + if (vtime_accounting_enabled_this_cpu()) > > + guest_exit_irqoff(); > > This looks ok, as CONFIG_CONTEXT_TRACKING and CONFIG_VIRT_CPU_ACCOUNTING_GEN > are > selected by CONFIG_NO_HZ_FULL=y, and can't be enabled independently, e.g. the > rcu_user_exit() call won't be delayed because it will never be called in the > !vtime case. But it still feels wrong poking into those details, e.g. it'll > be weird and/or wrong guest_exit_irqoff() gains stuff that isn't vtime > specific. Could you elaborate what's the meaning of "it'll be weird and/or wrong guest_exit_irqoff() gains stuff that isn't vtime specific."? Wanpeng
[PATCH] KVM: X86: Properly account for guest CPU time when considering context tracking
From: Wanpeng Li The bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=209831 reported that the guest time remains 0 when running a while true loop in the guest. The commit 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") moves guest_exit_irqoff() close to vmexit breaks the tick-based time accouting when the ticks that happen after IRQs are disabled are incorrectly accounted to the host/system time. This is because we exit the guest state too early. vtime-based time accounting is tied to context tracking, keep the guest_exit_irqoff() around vmexit code when both vtime-based time accounting and specific cpu is context tracking mode active. Otherwise, leave guest_exit_irqoff() after handle_exit_irqoff() and explicit IRQ window for tick-based time accouting. Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") Cc: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 3 ++- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58a45bb..55fb5ce 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3812,7 +3812,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 32cf828..85695b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6689,7 +6689,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * into world and some more. 
*/ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe806e8..234c8b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9185,6 +9185,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); + if (!vtime_accounting_enabled_this_cpu()) + guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; -- 2.7.4
Re: [syzbot] possible deadlock in scheduler_tick
Cc David Woodhouse, On Wed, 24 Mar 2021 at 18:11, syzbot wrote: > > Hello, > > syzbot found the following issue on: > > HEAD commit:1c273e10 Merge tag 'zonefs-5.12-rc4' of git://git.kernel.o.. > git tree: upstream > console output: https://syzkaller.appspot.com/x/log.txt?x=13c0414ed0 > kernel config: https://syzkaller.appspot.com/x/.config?x=6abda3336c698a07 > dashboard link: https://syzkaller.appspot.com/bug?extid=b282b65c2c68492df769 > userspace arch: i386 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=17d86ad6d0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17b8497cd0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+b282b65c2c68492df...@syzkaller.appspotmail.com > > = > WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected > 5.12.0-rc3-syzkaller #0 Not tainted > - > syz-executor030/8435 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: > c90001a2a230 (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2}, at: > spin_lock include/linux/spinlock.h:354 [inline] > c90001a2a230 (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2}, at: > get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587 > > and this task is already holding: > 8880b9d35198 (>lock){-.-.}-{2:2}, at: rq_lock > kernel/sched/sched.h:1321 [inline] > 8880b9d35198 (>lock){-.-.}-{2:2}, at: __schedule+0x21c/0x21b0 > kernel/sched/core.c:4990 > which would create a new lock dependency: > (>lock){-.-.}-{2:2} -> (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2} > > but this new dependency connects a HARDIRQ-irq-safe lock: > (>lock){-.-.}-{2:2} > > ... 
which became HARDIRQ-irq-safe at: > lock_acquire kernel/locking/lockdep.c:5510 [inline] > lock_acquire+0x1ab/0x740 kernel/locking/lockdep.c:5475 > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151 > rq_lock kernel/sched/sched.h:1321 [inline] > scheduler_tick+0xa4/0x4b0 kernel/sched/core.c:4538 > update_process_times+0x191/0x200 kernel/time/timer.c:1801 > tick_periodic+0x79/0x230 kernel/time/tick-common.c:100 > tick_handle_periodic+0x41/0x120 kernel/time/tick-common.c:112 > timer_interrupt+0x3f/0x60 arch/x86/kernel/time.c:57 > __handle_irq_event_percpu+0x303/0x8f0 kernel/irq/handle.c:156 > handle_irq_event_percpu kernel/irq/handle.c:196 [inline] > handle_irq_event+0x102/0x290 kernel/irq/handle.c:213 > handle_level_irq+0x256/0x6e0 kernel/irq/chip.c:650 > generic_handle_irq_desc include/linux/irqdesc.h:158 [inline] > handle_irq arch/x86/kernel/irq.c:231 [inline] > __common_interrupt+0x9e/0x200 arch/x86/kernel/irq.c:250 > common_interrupt+0x9f/0xd0 arch/x86/kernel/irq.c:240 > asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:623 > __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:161 [inline] > _raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:191 > __setup_irq+0xc72/0x1ce0 kernel/irq/manage.c:1737 > request_threaded_irq+0x28a/0x3b0 kernel/irq/manage.c:2127 > request_irq include/linux/interrupt.h:160 [inline] > setup_default_timer_irq arch/x86/kernel/time.c:70 [inline] > hpet_time_init+0x28/0x42 arch/x86/kernel/time.c:82 > x86_late_time_init+0x58/0x94 arch/x86/kernel/time.c:94 > start_kernel+0x3ee/0x496 init/main.c:1028 > secondary_startup_64_no_verify+0xb0/0xbb > > to a HARDIRQ-irq-unsafe lock: > (>arch.pvclock_gtod_sync_lock){+.+.}-{2:2} > > ... which became HARDIRQ-irq-unsafe at: > ... 
> lock_acquire kernel/locking/lockdep.c:5510 [inline] > lock_acquire+0x1ab/0x740 kernel/locking/lockdep.c:5475 > __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] > _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151 > spin_lock include/linux/spinlock.h:354 [inline] > kvm_synchronize_tsc+0x459/0x1230 arch/x86/kvm/x86.c:2332 > kvm_arch_vcpu_postcreate+0x73/0x180 arch/x86/kvm/x86.c:10183 > kvm_vm_ioctl_create_vcpu arch/x86/kvm/../../../virt/kvm/kvm_main.c:3239 > [inline] > kvm_vm_ioctl+0x1b2d/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3839 > kvm_vm_compat_ioctl+0x125/0x230 > arch/x86/kvm/../../../virt/kvm/kvm_main.c:4052 > __do_compat_sys_ioctl+0x1d3/0x230 fs/ioctl.c:842 > do_syscall_32_irqs_on arch/x86/entry/common.c:77 [inline] > __do_fast_syscall_32+0x56/0x90 arch/x86/entry/common.c:140 > do_fast_syscall_32+0x2f/0x70 arch/x86/entry/common.c:165 > entry_SYSENTER_compat_after_hwframe+0x4d/0x5c > > other info that might help us debug this: > > Possible interrupt unsafe locking scenario: > >CPU0CPU1 > > lock(>arch.pvclock_gtod_sync_lock); >local_irq_disable(); >lock(>lock); >lock(>arch.pvclock_gtod_sync_lock); > > lock(>lock); > The offender is
Re: [PATCH] KVM: arm: memcg awareness
On Wed, 17 Mar 2021 at 16:04, Wanpeng Li wrote: > > On Wed, 17 Mar 2021 at 15:57, Michal Hocko wrote: > > > > On Wed 17-03-21 13:46:24, Wanpeng Li wrote: > > > From: Wanpeng Li > > > > > > KVM allocations in the arm kvm code which are tied to the life > > > of the VM process should be charged to the VM process's cgroup. > > > > How much memory are we talking about? > > > > > This will help the memcg controller to do the right decisions. > > > > This is a bit vague. What is the right decision? AFAICS none of that > > memory is considered during oom victim selection. The only thing memcg > > controller can help with is to contain and account this additional > > memory. This might help to better isolate multiple workloads on the same > > system. Maybe this is what you wanted to say? Or maybe this is a way to > > prevent untrusted users from consuming a lot of memory? > https://patchwork.kernel.org/project/kvm/patch/20190211190252.198101-1-bgar...@google.com/ > It is explained in this patchset for x86 kvm which is upstream, I > think I don't need to copy and paste. :) > > Wanpeng
Re: [PATCH] KVM: arm: memcg awareness
On Wed, 17 Mar 2021 at 15:57, Michal Hocko wrote: > > On Wed 17-03-21 13:46:24, Wanpeng Li wrote: > > From: Wanpeng Li > > > > KVM allocations in the arm kvm code which are tied to the life > > of the VM process should be charged to the VM process's cgroup. > > How much memory are we talking about? > > > This will help the memcg controller to do the right decisions. > > This is a bit vague. What is the right decision? AFAICS none of that > memory is considered during oom victim selection. The only thing memcg > controller can help with is to contain and account this additional > memory. This might help to better isolate multiple workloads on the same > system. Maybe this is what you wanted to say? Or maybe this is a way to > prevent untrusted users from consuming a lot of memory? It is explained in this patchset for x86 kvm which is upstream, I think I don't need to copy and paste. :) Wanpeng
[PATCH] KVM: arm: memcg awareness
From: Wanpeng Li KVM allocations in the arm kvm code which are tied to the life of the VM process should be charged to the VM process's cgroup. This will help the memcg controler to do the right decisions. Signed-off-by: Wanpeng Li --- arch/arm64/kvm/arm.c | 5 +++-- arch/arm64/kvm/hyp/pgtable.c | 4 ++-- arch/arm64/kvm/mmu.c | 4 ++-- arch/arm64/kvm/pmu-emul.c | 2 +- arch/arm64/kvm/reset.c | 2 +- arch/arm64/kvm/vgic/vgic-debug.c | 2 +- arch/arm64/kvm/vgic/vgic-init.c| 2 +- arch/arm64/kvm/vgic/vgic-irqfd.c | 2 +- arch/arm64/kvm/vgic/vgic-its.c | 14 +++--- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +- arch/arm64/kvm/vgic/vgic-v4.c | 2 +- 11 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7f06ba7..8040874 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -278,9 +278,10 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm *kvm_arch_alloc_vm(void) { if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); + return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT); - return vzalloc(sizeof(struct kvm)); + return __vmalloc(sizeof(struct kvm), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 926fc07..a0845d3 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -366,7 +366,7 @@ static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) return -EINVAL; - childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!childp) return -ENOMEM; @@ -401,7 +401,7 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) { u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); - pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); + pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pgt->pgd) return -ENOMEM; diff --git 
a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8711894..8c9dc49 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -370,7 +370,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return -EINVAL; } - pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; @@ -1244,7 +1244,7 @@ int kvm_mmu_init(void) goto out; } - hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL); + hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL_ACCOUNT); if (!hyp_pgtable) { kvm_err("Hyp mode page-table not allocated\n"); err = -ENOMEM; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index e32c6e1..00cf750 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -967,7 +967,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) mutex_lock(>kvm->lock); if (!vcpu->kvm->arch.pmu_filter) { - vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!vcpu->kvm->arch.pmu_filter) { mutex_unlock(>kvm->lock); return -ENOMEM; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index bd354cd..3cbcf6b 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -110,7 +110,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl > SVE_VL_ARCH_MAX)) return -EIO; - buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c index f38c40a..e6a01f2 100644 --- a/arch/arm64/kvm/vgic/vgic-debug.c +++ b/arch/arm64/kvm/vgic/vgic-debug.c @@ -92,7 +92,7 @@ static void *vgic_debug_start(struct seq_file *s, loff_t *pos) goto out; } - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL_ACCOUNT); if (!iter) { iter = 
ERR_PTR(-ENOMEM); goto out; diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 052917d..27d1513 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Sat, 13 Mar 2021 at 17:33, Paolo Bonzini wrote: > > On 13/03/21 01:57, Wanpeng Li wrote: > >> A third option would be to split the paths. In the end, it's only the > >> ptr/val > >> line that's shared. > > I just sent out a formal patch for my alternative fix, I think the > > whole logic in kvm_wait is more clear w/ my version. > > > > I don't know, having three "if"s in 10 lines of code is a bit daunting. Fair enough, just sent out v3 per Sean's suggestion. Wanpeng
[PATCH v3] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by playing local_irq_disable()/enable() directly. Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- v2 -> v3: * per Sean's suggestion arch/x86/kernel/kvm.c | 23 ++- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..72dbb74 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; - if (in_nmi()) return; - local_irq_save(flags); - - if (READ_ONCE(*ptr) != val) - goto out; - /* * halt until it's our turn and kicked. 
Note that we do safe halt * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); + if (irqs_disabled()) { + if (READ_ONCE(*ptr) == val) + halt(); + } else { + local_irq_disable(); -out: - local_irq_restore(flags); + if (READ_ONCE(*ptr) == val) + safe_halt(); + + local_irq_enable(); + } } #ifdef CONFIG_X86_32 -- 2.7.4
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Thu, 11 Mar 2021 at 23:54, Sean Christopherson wrote: > > On Tue, Feb 23, 2021, Wanpeng Li wrote: > > On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > > > > > From: Wanpeng Li > > > > > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > > > guest > > > splatting below during boot: > > > > > > raw_local_irq_restore() called with IRQs enabled > > > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > > > warn_bogus_irq_restore+0x26/0x30 > > > Modules linked in: hid_generic usbhid hid > > > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > > > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > > > Call Trace: > > > kvm_wait+0x76/0x90 > > > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > > > do_raw_spin_lock+0xc9/0xd0 > > > _raw_spin_lock+0x59/0x70 > > > lockref_get_not_dead+0xf/0x50 > > > __legitimize_path+0x31/0x60 > > > legitimize_root+0x37/0x50 > > > try_to_unlazy_next+0x7f/0x1d0 > > > lookup_fast+0xb0/0x170 > > > path_openat+0x165/0x9b0 > > > do_filp_open+0x99/0x110 > > > do_sys_openat2+0x1f1/0x2e0 > > > do_sys_open+0x5c/0x80 > > > __x64_sys_open+0x21/0x30 > > > do_syscall_64+0x32/0x50 > > > entry_SYSCALL_64_after_hwframe+0x44/0xae > > > > > > The irqflags handling in kvm_wait() which ends up doing: > > > > > > local_irq_save(flags); > > > safe_halt(); > > > local_irq_restore(flags); > > > > > > which triggered a new consistency checking, we generally expect > > > local_irq_save() and local_irq_restore() to be pared and sanely > > > nested, and so local_irq_restore() expects to be called with > > > irqs disabled. > > > > > > This patch fixes it by adding a local_irq_disable() after safe_halt() > > > to avoid this warning. 
> > > > > > Cc: Mark Rutland > > > Cc: Thomas Gleixner > > > Signed-off-by: Wanpeng Li > > > --- > > > arch/x86/kernel/kvm.c | 4 +++- > > > 1 file changed, 3 insertions(+), 1 deletion(-) > > > > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > > > index 5e78e01..688c84a 100644 > > > --- a/arch/x86/kernel/kvm.c > > > +++ b/arch/x86/kernel/kvm.c > > > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > > > */ > > > if (arch_irqs_disabled_flags(flags)) > > > halt(); > > > - else > > > + else { > > > safe_halt(); > > > + local_irq_disable(); > > > + } > > > > An alternative fix: > > > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > > index 5e78e01..7127aef 100644 > > --- a/arch/x86/kernel/kvm.c > > +++ b/arch/x86/kernel/kvm.c > > @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) > > > > static void kvm_wait(u8 *ptr, u8 val) > > { > > -unsigned long flags; > > +bool disabled = irqs_disabled(); > > > > if (in_nmi()) > > return; > > > > -local_irq_save(flags); > > +if (!disabled) > > +local_irq_disable(); > > > > if (READ_ONCE(*ptr) != val) > > goto out; > > @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) > > * for irq enabled case to avoid hang when lock info is overwritten > > * in irq spinlock slowpath and no spurious interrupt occur to save us. > > */ > > -if (arch_irqs_disabled_flags(flags)) > > +if (disabled) > > halt(); > > else > > safe_halt(); > > > > out: > > -local_irq_restore(flags); > > +if (!disabled) > > +local_irq_enable(); > > } > > > > #ifdef CONFIG_X86_32 > > A third option would be to split the paths. In the end, it's only the ptr/val > line that's shared. I just sent out a formal patch for my alternative fix, I think the whole logic in kvm_wait is more clear w/ my version. 
> > --- > arch/x86/kernel/kvm.c | 23 ++- > 1 file changed, 10 insertions(+), 13 deletions(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01ca3b4..78bb0fae3982 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu) > > s
[PATCH v2] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by playing local_irq_disable()/enable() directly. 
Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- v1 -> v2: * select the alternative fix arch/x86/kernel/kvm.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..7127aef 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { - unsigned long flags; + bool disabled = irqs_disabled(); if (in_nmi()) return; - local_irq_save(flags); + if (!disabled) + local_irq_disable(); if (READ_ONCE(*ptr) != val) goto out; @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ - if (arch_irqs_disabled_flags(flags)) + if (disabled) halt(); else safe_halt(); out: - local_irq_restore(flags); + if (!disabled) + local_irq_enable(); } #ifdef CONFIG_X86_32 -- 2.7.4
[PATCH] KVM: X86: Fix missing local pCPU when executing wbinvd on all dirty pCPUs
From: Wanpeng Li We should execute wbinvd on all dirty pCPUs when guest wbinvd exits to maintain data consistency in order to deal with noncoherent DMA. smp_call_function_many() does not execute the provided function on the local core, this patch replaces it by on_each_cpu_mask(). Reported-by: Nadav Amit Cc: Nadav Amit Signed-off-by: Wanpeng Li --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 012d5df..aa6d667 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6638,7 +6638,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); -- 2.7.4
Re: [PATCH] KVM: x86: Ensure deadline timer has truly expired before posting its IRQ
On Fri, 5 Mar 2021 at 10:19, Sean Christopherson wrote: > > When posting a deadline timer interrupt, open code the checks guarding > __kvm_wait_lapic_expire() in order to skip the lapic_timer_int_injected() > check in kvm_wait_lapic_expire(). The injection check will always fail > since the interrupt has not yet be injected. Moving the call after > injection would also be wrong as that wouldn't actually delay delivery > of the IRQ if it is indeed sent via posted interrupt. > > Fixes: 010fd37fddf6 ("KVM: LAPIC: Reduce world switch latency caused by > timer_advance_ns") > Cc: sta...@vger.kernel.org > Signed-off-by: Sean Christopherson > --- > arch/x86/kvm/lapic.c | 11 ++- > 1 file changed, 10 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 45d40bfacb7c..cb8ebfaccfb6 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -1642,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, > bool from_timer_fn) > } > > if (kvm_use_posted_timer_interrupt(apic->vcpu)) { > - kvm_wait_lapic_expire(vcpu); > + /* > +* Ensure the guest's timer has truly expired before posting > an > +* interrupt. Open code the relevant checks to avoid querying > +* lapic_timer_int_injected(), which will be false since the > +* interrupt isn't yet injected. Waiting until after > injecting > +* is not an option since that won't help a posted interrupt. > +*/ > + if (vcpu->arch.apic->lapic_timer.expired_tscdeadline && > + vcpu->arch.apic->lapic_timer.timer_advance_ns) > + __kvm_wait_lapic_expire(vcpu); Thanks for the fixing. Wanpeng
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
ping, On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > From: Wanpeng Li > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > guest > splatting below during boot: > > raw_local_irq_restore() called with IRQs enabled > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > warn_bogus_irq_restore+0x26/0x30 > Modules linked in: hid_generic usbhid hid > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > Call Trace: > kvm_wait+0x76/0x90 > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > do_raw_spin_lock+0xc9/0xd0 > _raw_spin_lock+0x59/0x70 > lockref_get_not_dead+0xf/0x50 > __legitimize_path+0x31/0x60 > legitimize_root+0x37/0x50 > try_to_unlazy_next+0x7f/0x1d0 > lookup_fast+0xb0/0x170 > path_openat+0x165/0x9b0 > do_filp_open+0x99/0x110 > do_sys_openat2+0x1f1/0x2e0 > do_sys_open+0x5c/0x80 > __x64_sys_open+0x21/0x30 > do_syscall_64+0x32/0x50 > entry_SYSCALL_64_after_hwframe+0x44/0xae > > The irqflags handling in kvm_wait() which ends up doing: > > local_irq_save(flags); > safe_halt(); > local_irq_restore(flags); > > which triggered a new consistency checking, we generally expect > local_irq_save() and local_irq_restore() to be pared and sanely > nested, and so local_irq_restore() expects to be called with > irqs disabled. > > This patch fixes it by adding a local_irq_disable() after safe_halt() > to avoid this warning. > > Cc: Mark Rutland > Cc: Thomas Gleixner > Signed-off-by: Wanpeng Li > --- > arch/x86/kernel/kvm.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01..688c84a 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > */ > if (arch_irqs_disabled_flags(flags)) > halt(); > - else > + else { > safe_halt(); > + local_irq_disable(); > + } > > out: > local_irq_restore(flags); > -- > 2.7.4 >
Re: [PATCH v2] KVM: LAPIC: Advancing the timer expiration on guest initiated write
ping, :) On Thu, 4 Mar 2021 at 08:35, Wanpeng Li wrote: > > From: Wanpeng Li > > Advancing the timer expiration should only be necessary on guest initiated > writes. When we cancel the timer and clear .pending during state restore, > clear expired_tscdeadline as well. > > Reviewed-by: Sean Christopherson > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * update patch description > > arch/x86/kvm/lapic.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 45d40bf..f2b6e79 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct > kvm_lapic_state *s) > > apic_update_ppr(apic); > hrtimer_cancel(>lapic_timer.timer); > + apic->lapic_timer.expired_tscdeadline = 0; > apic_update_lvtt(apic); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > update_divide_count(apic); > -- > 2.7.4 >
Re: [PATCH v4] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
ping, :) On Wed, 24 Feb 2021 at 09:38, Wanpeng Li wrote: > > From: Wanpeng Li > > # lscpu > Architecture: x86_64 > CPU op-mode(s):32-bit, 64-bit > Byte Order:Little Endian > CPU(s):88 > On-line CPU(s) list: 0-63 > Off-line CPU(s) list: 64-87 > > # cat /proc/cmdline > BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro > rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 > no-kvmclock-vsyscall > > # echo 1 > /sys/devices/system/cpu/cpu76/online > -bash: echo: write error: Cannot allocate memory > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall > and sometimes you end up with vsyscall disabled if the host does something > strange. This patch fixes it by allocating this dynamically memory > unconditionally even if vsyscall is disabled. 
> > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > variables") > Reported-by: Zelin Deng > Cc: Brijesh Singh > Cc: sta...@vger.kernel.org#v4.19-rc5+ > Signed-off-by: Wanpeng Li > --- > v3 -> v4: > * fix kernel test robot report WARNING > v2 -> v3: > * allocate dynamically memory unconditionally > v1 -> v2: > * add code comments > > arch/x86/kernel/kvmclock.c | 19 +-- > 1 file changed, 9 insertions(+), 10 deletions(-) > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index aa59374..1fc0962 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) > > static int __init kvm_setup_vsyscall_timeinfo(void) > { > -#ifdef CONFIG_X86_64 > - u8 flags; > + kvmclock_init_mem(); > > - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) > - return 0; > +#ifdef CONFIG_X86_64 > + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { > + u8 flags; > > - flags = pvclock_read_flags(_clock_boot[0].pvti); > - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) > - return 0; > + flags = pvclock_read_flags(_clock_boot[0].pvti); > + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) > + return 0; > > - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; > + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; > + } > #endif > > - kvmclock_init_mem(); > - > return 0; > } > early_initcall(kvm_setup_vsyscall_timeinfo); > -- > 2.7.4 >
[PATCH v2] KVM: LAPIC: Advancing the timer expiration on guest initiated write
From: Wanpeng Li Advancing the timer expiration should only be necessary on guest initiated writes. When we cancel the timer and clear .pending during state restore, clear expired_tscdeadline as well. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- v1 -> v2: * update patch description arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bf..f2b6e79 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(>lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); -- 2.7.4
Re: [PATCH] KVM: LAPIC: Advancing the timer expiration on guest initiated write
On Wed, 3 Mar 2021 at 01:16, Sean Christopherson wrote: > > On Tue, Mar 02, 2021, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Advancing the timer expiration should only be necessary on guest initiated > > writes. Now, we cancel the timer, clear .pending and clear > > expired_tscdeadline > > at the same time during state restore. > > That last sentence is confusing. kvm_apic_set_state() already clears > .pending, > by way of __start_apic_timer(). I think what you mean is: > > When we cancel the timer and clear .pending during state restore, clear > expired_tscdeadline as well. Good statement. :) > > With that, > > Reviewed-by: Sean Christopherson > > > Side topic, I think there's a theoretical bug where KVM could inject a > spurious > timer interrupt. If KVM is using hrtimer, the hrtimer expires early due to an > overzealous timer_advance_ns, and the guest writes MSR_TSCDEADLINE after the > hrtimer expires but before the vCPU is kicked, then KVM will inject a spurious > timer IRQ since the premature expiration should have been canceled by the > guest's > WRMSR. > > It could also cause KVM to soft hang the guest if the new > lapic_timer.tscdeadline > is written before apic_timer_expired() captures it in expired_tscdeadline. In > that case, KVM will wait for the new deadline, which could be far in the > future. The hrtimer_cancel() before setting new lapic_timer.tscdeadline in kvm_set_lapic_tscdeadline_msr() will wait for the hrtimer callback function to finish. Could it solve this issue? Wanpeng
[PATCH] KVM: LAPIC: Advancing the timer expiration on guest initiated write
From: Wanpeng Li Advancing the timer expiration should only be necessary on guest initiated writes. Now, we cancel the timer, clear .pending and clear expired_tscdeadline at the same time during state restore. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 45d40bf..f2b6e79 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2595,6 +2595,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_update_ppr(apic); hrtimer_cancel(>lapic_timer.timer); + apic->lapic_timer.expired_tscdeadline = 0; apic_update_lvtt(apic); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); -- 2.7.4
[PATCH] KVM: x86: hyper-v: Fix Hyper-V context null-ptr-deref
From: Wanpeng Li Reported by syzkaller: KASAN: null-ptr-deref in range [0x0140-0x0147] CPU: 1 PID: 8370 Comm: syz-executor859 Not tainted 5.11.0-syzkaller #0 RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 Call Trace: kvm_set_irq_routing+0x69b/0x940 arch/x86/kvm/../../../virt/kvm/irqchip.c:223 kvm_vm_ioctl+0x12d0/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3959 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Hyper-V context is lazily allocated until Hyper-V specific MSRs are accessed or SynIC is enabled. However, the syzkaller testcase sets irq routing table directly w/o enabling SynIC. This results in null-ptr-deref when accessing SynIC Hyper-V context. This patch fixes it. syzkaller source: https://syzkaller.appspot.com/x/repro.c?x=163342ccd0 Reported-by: syzbot+6987f3b2dbd9eda95...@syzkaller.appspotmail.com Fixes: 8f014550dfb1 ("KVM: x86: hyper-v: Make Hyper-V emulation enablement conditional") Signed-off-by: Wanpeng Li --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7d2dae9..58fa8c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -159,7 +159,7 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); - if (!vcpu) + if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; -- 2.7.4
Re: general protection fault in kvm_hv_irq_routing_update
On Fri, 26 Feb 2021 at 15:01, syzbot wrote: > > Hello, > > syzbot found the following issue on: > > HEAD commit:a99163e9 Merge tag 'devicetree-for-5.12' of git://git.kern.. > git tree: upstream > console output: https://syzkaller.appspot.com/x/log.txt?x=12d72682d0 > kernel config: https://syzkaller.appspot.com/x/.config?x=7a875029a795d230 > dashboard link: https://syzkaller.appspot.com/bug?extid=6987f3b2dbd9eda95f12 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=12faef12d0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=163342ccd0 > > The issue was bisected to: > > commit 8f014550dfb114cc7f42a517d20d2cf887a0b771 > Author: Vitaly Kuznetsov > Date: Tue Jan 26 13:48:14 2021 + > > KVM: x86: hyper-v: Make Hyper-V emulation enablement conditional > > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10df16a8d0 > final oops: https://syzkaller.appspot.com/x/report.txt?x=12df16a8d0 > console output: https://syzkaller.appspot.com/x/log.txt?x=14df16a8d0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+6987f3b2dbd9eda95...@syzkaller.appspotmail.com > Fixes: 8f014550dfb1 ("KVM: x86: hyper-v: Make Hyper-V emulation enablement > conditional") > > L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and > https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for > details. 
> general protection fault, probably for non-canonical address > 0xdc28: [#1] PREEMPT SMP KASAN > KASAN: null-ptr-deref in range [0x0140-0x0147] > CPU: 1 PID: 8370 Comm: syz-executor859 Not tainted 5.11.0-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] > RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] > RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 > Code: 80 19 00 00 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 ff 01 00 00 4d 8b ad > 80 19 00 00 49 8d bd 40 01 00 00 48 89 f8 48 c1 e8 03 <0f> b6 04 28 84 c0 74 > 06 0f 8e d2 01 00 00 45 0f b6 bd 40 01 00 00 > RSP: 0018:c90001b3fac0 EFLAGS: 00010206 > RAX: 0028 RBX: 888012df5900 RCX: > RDX: 888022193780 RSI: 81174d43 RDI: 0140 > RBP: dc00 R08: R09: c900018819eb > R10: 81170f3e R11: R12: > R13: R14: R15: 0001 > FS: 00a73300() GS:8880b9d0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 557e8c876888 CR3: 13c0b000 CR4: 001526e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > kvm_set_irq_routing+0x69b/0x940 arch/x86/kvm/../../../virt/kvm/irqchip.c:223 > kvm_vm_ioctl+0x12d0/0x2800 arch/x86/kvm/../../../virt/kvm/kvm_main.c:3959 > vfs_ioctl fs/ioctl.c:48 [inline] > __do_sys_ioctl fs/ioctl.c:753 [inline] > __se_sys_ioctl fs/ioctl.c:739 [inline] > __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739 > do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 > entry_SYSCALL_64_after_hwframe+0x44/0xae > RIP: 0033:0x43ef29 > Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 > 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 > RSP: 002b:7ffe391eb808 EFLAGS: 0246 ORIG_RAX: 0010 > RAX: ffda RBX: 00400488 RCX: 0043ef29 > RDX: 2140 RSI: 4008ae6a RDI: 0004 > RBP: 00402f10 R08: 00400488 R09: 00400488 > R10: 00400488 R11: 0246 R12: 00402fa0 > R13: R14: 004ac018 R15: 00400488 > 
Modules linked in: > ---[ end trace 2aa75ec1dd148710 ]--- > RIP: 0010:synic_get arch/x86/kvm/hyperv.c:165 [inline] Missing check to_hv_vcpu(vcpu) in synic_get(), I will fix it. > RIP: 0010:kvm_hv_set_sint_gsi arch/x86/kvm/hyperv.c:475 [inline] > RIP: 0010:kvm_hv_irq_routing_update+0x230/0x460 arch/x86/kvm/hyperv.c:498 > Code: 80 19 00 00 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 ff 01 00 00 4d 8b ad > 80 19 00 00 49 8d bd 40 01 00 00 48 89 f8 48 c1 e8 03 <0f> b6 04 28 84 c0 74 > 06 0f 8e d2 01 00 00 45 0f b6 bd 40 01 00 00 > RSP: 0018:c90001b3fac0 EFLAGS: 00010206 > RAX: 0028 RBX: 888012df5900 RCX: > RDX: 888022193780 RSI: 81174d43 RDI: 0140 > RBP: dc00 R08: R09: c900018819eb > R10: 81170f3e R11: R12: > R13: R14: R15: 0001 > FS: 00a73300() GS:8880b9d0()
[PATCH v4] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li # lscpu Architecture: x86_64 CPU op-mode(s):32-bit, 64-bit Byte Order:Little Endian CPU(s):88 On-line CPU(s) list: 0-63 Off-line CPU(s) list: 64-87 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall # echo 1 > /sys/devices/system/cpu/cpu76/online -bash: echo: write error: Cannot allocate memory The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall and sometimes you end up with vsyscall disabled if the host does something strange. This patch fixes it by allocating this dynamically memory unconditionally even if vsyscall is disabled. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v3 -> v4: * fix kernel test robot report WARNING v2 -> v3: * allocate dynamically memory unconditionally v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..1fc0962 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { -#ifdef CONFIG_X86_64 - u8 flags; + kvmclock_init_mem(); - if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) - return 0; +#ifdef CONFIG_X86_64 + if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) { + u8 flags; - flags = pvclock_read_flags(_clock_boot[0].pvti); - if (!(flags & 
PVCLOCK_TSC_STABLE_BIT)) - return 0; + flags = pvclock_read_flags(_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; + } #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); -- 2.7.4
Re: [PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
On Tue, 23 Feb 2021 at 13:25, Wanpeng Li wrote: > > From: Wanpeng Li > > After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the > guest > splatting below during boot: > > raw_local_irq_restore() called with IRQs enabled > WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 > warn_bogus_irq_restore+0x26/0x30 > Modules linked in: hid_generic usbhid hid > CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 > RIP: 0010:warn_bogus_irq_restore+0x26/0x30 > Call Trace: > kvm_wait+0x76/0x90 > __pv_queued_spin_lock_slowpath+0x285/0x2e0 > do_raw_spin_lock+0xc9/0xd0 > _raw_spin_lock+0x59/0x70 > lockref_get_not_dead+0xf/0x50 > __legitimize_path+0x31/0x60 > legitimize_root+0x37/0x50 > try_to_unlazy_next+0x7f/0x1d0 > lookup_fast+0xb0/0x170 > path_openat+0x165/0x9b0 > do_filp_open+0x99/0x110 > do_sys_openat2+0x1f1/0x2e0 > do_sys_open+0x5c/0x80 > __x64_sys_open+0x21/0x30 > do_syscall_64+0x32/0x50 > entry_SYSCALL_64_after_hwframe+0x44/0xae > > The irqflags handling in kvm_wait() which ends up doing: > > local_irq_save(flags); > safe_halt(); > local_irq_restore(flags); > > which triggered a new consistency checking, we generally expect > local_irq_save() and local_irq_restore() to be pared and sanely > nested, and so local_irq_restore() expects to be called with > irqs disabled. > > This patch fixes it by adding a local_irq_disable() after safe_halt() > to avoid this warning. 
> > Cc: Mark Rutland > Cc: Thomas Gleixner > Signed-off-by: Wanpeng Li > --- > arch/x86/kernel/kvm.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c > index 5e78e01..688c84a 100644 > --- a/arch/x86/kernel/kvm.c > +++ b/arch/x86/kernel/kvm.c > @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) > */ > if (arch_irqs_disabled_flags(flags)) > halt(); > - else > + else { > safe_halt(); > + local_irq_disable(); > + } An alternative fix: diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..7127aef 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -836,12 +836,13 @@ static void kvm_kick_cpu(int cpu) static void kvm_wait(u8 *ptr, u8 val) { -unsigned long flags; +bool disabled = irqs_disabled(); if (in_nmi()) return; -local_irq_save(flags); +if (!disabled) +local_irq_disable(); if (READ_ONCE(*ptr) != val) goto out; @@ -851,13 +852,14 @@ static void kvm_wait(u8 *ptr, u8 val) * for irq enabled case to avoid hang when lock info is overwritten * in irq spinlock slowpath and no spurious interrupt occur to save us. */ -if (arch_irqs_disabled_flags(flags)) +if (disabled) halt(); else safe_halt(); out: -local_irq_restore(flags); +if (!disabled) +local_irq_enable(); } #ifdef CONFIG_X86_32
[PATCH] x86/kvm: Fix broken irq restoration in kvm_wait
From: Wanpeng Li After commit 997acaf6b4b59c (lockdep: report broken irq restoration), the guest splatting below during boot: raw_local_irq_restore() called with IRQs enabled WARNING: CPU: 1 PID: 169 at kernel/locking/irqflag-debug.c:10 warn_bogus_irq_restore+0x26/0x30 Modules linked in: hid_generic usbhid hid CPU: 1 PID: 169 Comm: systemd-udevd Not tainted 5.11.0+ #25 RIP: 0010:warn_bogus_irq_restore+0x26/0x30 Call Trace: kvm_wait+0x76/0x90 __pv_queued_spin_lock_slowpath+0x285/0x2e0 do_raw_spin_lock+0xc9/0xd0 _raw_spin_lock+0x59/0x70 lockref_get_not_dead+0xf/0x50 __legitimize_path+0x31/0x60 legitimize_root+0x37/0x50 try_to_unlazy_next+0x7f/0x1d0 lookup_fast+0xb0/0x170 path_openat+0x165/0x9b0 do_filp_open+0x99/0x110 do_sys_openat2+0x1f1/0x2e0 do_sys_open+0x5c/0x80 __x64_sys_open+0x21/0x30 do_syscall_64+0x32/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xae The irqflags handling in kvm_wait() which ends up doing: local_irq_save(flags); safe_halt(); local_irq_restore(flags); which triggered a new consistency checking, we generally expect local_irq_save() and local_irq_restore() to be pared and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. This patch fixes it by adding a local_irq_disable() after safe_halt() to avoid this warning. Cc: Mark Rutland Cc: Thomas Gleixner Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5e78e01..688c84a 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -853,8 +853,10 @@ static void kvm_wait(u8 *ptr, u8 val) */ if (arch_irqs_disabled_flags(flags)) halt(); - else + else { safe_halt(); + local_irq_disable(); + } out: local_irq_restore(flags); -- 2.7.4
[PATCH v3] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. It's broken for no-vsyscall and sometimes you end up with vsyscall disabled if the host does something strange. This patch fixes it by allocating this dynamically memory unconditionally even if vsyscall is disabled. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v2 -> v3: * allocate dynamically memory unconditionally v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..a72b16e 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -268,6 +268,8 @@ static void __init kvmclock_init_mem(void) static int __init kvm_setup_vsyscall_timeinfo(void) { + kvmclock_init_mem(); + #ifdef CONFIG_X86_64 u8 flags; @@ -281,8 +283,6 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; #endif - kvmclock_init_mem(); - return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); -- 2.7.4
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Wed, 27 Jan 2021 at 08:28, Wanpeng Li wrote: > > On Wed, 27 Jan 2021 at 01:26, Paolo Bonzini wrote: > > > > On 26/01/21 02:28, Wanpeng Li wrote: > > > ping, > > > On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > > >> > > >> From: Wanpeng Li > > >> > > >> The per-cpu vsyscall pvclock data pointer assigns either an element of > > >> the > > >> static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > > >> hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > > >> kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > > >> kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > > >> assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is > > >> not > > >> VDSO_CLOCKMODE_PVCLOCK. > > > > I am sorry, I still cannot figure out this patch. > > > > Is hotplug still broken if kvm vsyscall is enabled? > > Just when kvm vsyscall is disabled. :) > > # lscpu > Architecture: x86_64 > CPU op-mode(s):32-bit, 64-bit > Byte Order: Little Endian > CPU(s): 88 > On-line CPU(s) list: 0-63 > Off-line CPU(s) list: 64-87 > > # cat /proc/cmdline > BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root > ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US > .UTF-8 no-kvmclock-vsyscall > > # echo 1 > /sys/devices/system/cpu/cpu76/online > -bash: echo: write error: Cannot allocate memory The original bug report is here. https://bugzilla.kernel.org/show_bug.cgi?id=210213 Wanpeng
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Wed, 27 Jan 2021 at 01:26, Paolo Bonzini wrote: > > On 26/01/21 02:28, Wanpeng Li wrote: > > ping, > > On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > >> > >> From: Wanpeng Li > >> > >> The per-cpu vsyscall pvclock data pointer assigns either an element of the > >> static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > >> hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > >> kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > >> kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > >> assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > >> VDSO_CLOCKMODE_PVCLOCK. > > I am sorry, I still cannot figure out this patch. > > Is hotplug still broken if kvm vsyscall is enabled? Just when kvm vsyscall is disabled. :) # lscpu Architecture: x86_64 CPU op-mode(s):32-bit, 64-bit Byte Order: Little Endian CPU(s): 88 On-line CPU(s) list: 0-63 Off-line CPU(s) list: 64-87 # cat /proc/cmdline BOOT_IMAGE=/vmlinuz-5.10.0-rc3-tlinux2-0050+ root=/dev/mapper/cl-root ro rd.lvm.lv=cl/root rhgb quiet console=ttyS0 LANG=en_US .UTF-8 no-kvmclock-vsyscall # echo 1 > /sys/devices/system/cpu/cpu76/online -bash: echo: write error: Cannot allocate memory Wanpeng
Re: [PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
ping, On Mon, 18 Jan 2021 at 17:08, Wanpeng Li wrote: > > From: Wanpeng Li > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > VDSO_CLOCKMODE_PVCLOCK. > > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > variables") > Reported-by: Zelin Deng > Tested-by: Haiwei Li > Cc: Brijesh Singh > Cc: sta...@vger.kernel.org#v4.19-rc5+ > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * add code comments > > arch/x86/kernel/kvmclock.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > index aa59374..01d4e55c 100644 > --- a/arch/x86/kernel/kvmclock.c > +++ b/arch/x86/kernel/kvmclock.c > @@ -294,9 +294,11 @@ static int kvmclock_setup_percpu(unsigned int cpu) > /* > * The per cpu area setup replicates CPU0 data to all cpu > * pointers. So carefully check. CPU0 has been set up in init > -* already. > +* already. Assign vsyscall pvclock data pointer iff kvmclock > +* vsyscall is enabled. > */ > - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) > + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || > + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) > return 0; > > /* Use the static page for the first CPUs, allocate otherwise */ > -- > 2.7.4 >
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Fri, 15 Jan 2021 at 11:20, Wanpeng Li wrote: > > On Wed, 6 Jan 2021 at 08:51, Sean Christopherson wrote: > > > > +tglx > > > > On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote: > > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > > > > > After the introduction of the patch: > > > > > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > > > > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > > > enabling of irqs to process pending interrupts should not be required > > > within vcpu_enter_guest anymore. > > > > Ugh, except that commit completely broke tick-based accounting, on both > > Intel > > and AMD. With guest_exit_irqoff() being called immediately after VM-Exit, > > any > > tick that happens after IRQs are disabled will be accounted to the host. > > E.g. > > on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't > > processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has > > been > > cleared. > > > > This issue can be 100% reproduced. > https://bugzilla.kernel.org/show_bug.cgi?id=204177 Sorry, the posted link should be https://bugzilla.kernel.org/show_bug.cgi?id=209831 Wanpeng
Re: [PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Tue, 19 Jan 2021 at 02:27, Paolo Bonzini wrote: > > On 15/01/21 02:15, Wanpeng Li wrote: > >> The comment above should probably be updated as it is not clear why we > >> check kvm_clock.vdso_clock_mode here. Actually, I would even suggest we > >> introduce a 'kvmclock_tsc_stable' global instead to avoid this indirect > >> check. > > I prefer to update the comment above, assign vsyscall pvclock data > > pointer iff kvmclock vsyscall is enabled. > > Are you going to send v2? Yes. :) https://lore.kernel.org/kvm/1610960877-3110-1-git-send-email-wanpen...@tencent.com/ Wanpeng
[PATCH v2] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not VDSO_CLOCKMODE_PVCLOCK. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- v1 -> v2: * add code comments arch/x86/kernel/kvmclock.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..01d4e55c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -294,9 +294,11 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* * The per cpu area setup replicates CPU0 data to all cpu * pointers. So carefully check. CPU0 has been set up in init -* already. +* already. Assign vsyscall pvclock data pointer iff kvmclock +* vsyscall is enabled. */ - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) return 0; /* Use the static page for the first CPUs, allocate otherwise */ -- 2.7.4
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Wed, 6 Jan 2021 at 08:51, Sean Christopherson wrote: > > +tglx > > On Tue, Jan 05, 2021, Nitesh Narayan Lal wrote: > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > > > After the introduction of the patch: > > > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > > enabling of irqs to process pending interrupts should not be required > > within vcpu_enter_guest anymore. > > Ugh, except that commit completely broke tick-based accounting, on both Intel > and AMD. With guest_exit_irqoff() being called immediately after VM-Exit, any > tick that happens after IRQs are disabled will be accounted to the host. E.g. > on Intel, even an IRQ VM-Exit that has already been acked by the CPU isn't > processed until kvm_x86_ops.handle_exit_irqoff(), well after PF_VCPU has been > cleared. > This issue can be 100% reproduced. https://bugzilla.kernel.org/show_bug.cgi?id=204177 > CONFIG_VIRT_CPU_ACCOUNTING_GEN=y should still work (I didn't bother to > verify). > > Thomas, any clever ideas? Handling IRQs in {vmx,svm}_vcpu_enter_exit() isn't > an > option as KVM hasn't restored enough state to handle an IRQ, e.g. PKRU and > XCR0 > are still guest values. Is it too heinous to fudge PF_VCPU across KVM's > "pending" IRQ handling? E.g. 
this god-awful hack fixes the accounting: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 836912b42030..5a777fd35b4b 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -9028,6 +9028,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > vcpu->mode = OUTSIDE_GUEST_MODE; > smp_wmb(); > > + current->flags |= PF_VCPU; > kvm_x86_ops.handle_exit_irqoff(vcpu); > > /* > @@ -9042,6 +9043,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > ++vcpu->stat.exits; > local_irq_disable(); > kvm_after_interrupt(vcpu); > + current->flags &= ~PF_VCPU; > > if (lapic_in_kernel(vcpu)) { > s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; > > > Conflicts: > > arch/x86/kvm/svm.c > > > > Signed-off-by: Nitesh Narayan Lal > > --- > > arch/x86/kvm/svm/svm.c | 9 + > > arch/x86/kvm/x86.c | 11 --- > > 2 files changed, 9 insertions(+), 11 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index cce0143a6f80..c9b2fbb32484 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, > > > > static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) > > { > > + kvm_before_interrupt(vcpu); > > + local_irq_enable(); > > + /* > > + * We must have an instruction with interrupts enabled, so > > + * the timer interrupt isn't delayed by the interrupt shadow. 
> > + */ > > + asm("nop"); > > + local_irq_disable(); > > + kvm_after_interrupt(vcpu); > > } > > > > static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 3f7c1fc7a3ce..3e17c9ffcad8 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -9023,18 +9023,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > > > > kvm_x86_ops.handle_exit_irqoff(vcpu); > > > > - /* > > - * Consume any pending interrupts, including the possible source of > > - * VM-Exit on SVM and any ticks that occur between VM-Exit and now. > > - * An instruction is required after local_irq_enable() to fully > > unblock > > - * interrupts on processors that implement an interrupt shadow, the > > - * stat.exits increment will do nicely. > > - */ > > - kvm_before_interrupt(vcpu); > > - local_irq_enable(); > > ++vcpu->stat.exits; > > - local_irq_disable(); > > - kvm_after_interrupt(vcpu); > > > > if (lapic_in_kernel(vcpu)) { > > s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; > > -- > > 2.27.0 > >
Re: [PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
On Thu, 14 Jan 2021 at 21:45, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > The per-cpu vsyscall pvclock data pointer assigns either an element of the > > static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory > > hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if > > kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in > > kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not > > assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not > > VDSO_CLOCKMODE_PVCLOCK. > > > > Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared > > variables") > > Reported-by: Zelin Deng > > Tested-by: Haiwei Li > > Cc: Brijesh Singh > > Cc: sta...@vger.kernel.org#v4.19-rc5+ > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kernel/kvmclock.c | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c > > index aa59374..0624290 100644 > > --- a/arch/x86/kernel/kvmclock.c > > +++ b/arch/x86/kernel/kvmclock.c > > @@ -296,7 +296,8 @@ static int kvmclock_setup_percpu(unsigned int cpu) > >* pointers. So carefully check. CPU0 has been set up in init > >* already. > >*/ > > - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) > > + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || > > + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) > > return 0; > > The comment above should probably be updated as it is not clear why we > check kvm_clock.vdso_clock_mode here. Actually, I would even suggest we > introduce a 'kvmclock_tsc_stable' global instead to avoid this indirect > check. I prefer to update the comment above, assign vsyscall pvclock data pointer iff kvmclock vsyscall is enabled. Wanpeng
[PATCH] KVM: kvmclock: Fix vCPUs > 64 can't be online/hotplugged
From: Wanpeng Li The per-cpu vsyscall pvclock data pointer assigns either an element of the static array hv_clock_boot (#vCPU <= 64) or dynamically allocated memory hvclock_mem (vCPU > 64), the dynamically memory will not be allocated if kvmclock vsyscall is disabled, this can result in cpu hotpluged fails in kvmclock_setup_percpu() which returns -ENOMEM. This patch fixes it by not assigning vsyscall pvclock data pointer if kvmclock vdso_clock_mode is not VDSO_CLOCKMODE_PVCLOCK. Fixes: 6a1cac56f4 ("x86/kvm: Use __bss_decrypted attribute in shared variables") Reported-by: Zelin Deng Tested-by: Haiwei Li Cc: Brijesh Singh Cc: sta...@vger.kernel.org#v4.19-rc5+ Signed-off-by: Wanpeng Li --- arch/x86/kernel/kvmclock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index aa59374..0624290 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -296,7 +296,8 @@ static int kvmclock_setup_percpu(unsigned int cpu) * pointers. So carefully check. CPU0 has been set up in init * already. */ - if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0)) || + (kvm_clock.vdso_clock_mode != VDSO_CLOCKMODE_PVCLOCK)) return 0; /* Use the static page for the first CPUs, allocate otherwise */ -- 2.7.4
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Thu, 7 Jan 2021 at 17:35, Vitaly Kuznetsov wrote: > > Sean Christopherson writes: > > > On Wed, Jan 06, 2021, Vitaly Kuznetsov wrote: > >> > >> Looking back, I don't quite understand why we wanted to account ticks > >> between vmexit and exiting guest context as 'guest' in the first place; > >> to my understanging 'guest time' is time spent within VMX non-root > >> operation, the rest is KVM overhead (system). > > > > With tick-based accounting, if the tick IRQ is received after PF_VCPU is > > cleared > > then that tick will be accounted to the host/system. The motivation for > > opening > > an IRQ window after VM-Exit is to handle the case where the guest is > > constantly > > exiting for a different reason _just_ before the tick arrives, e.g. if the > > guest > > has its tick configured such that the guest and host ticks get synchronized > > in a bad way. > > > > This is a non-issue when using CONFIG_VIRT_CPU_ACCOUNTING_GEN=y, at least > > with a > > stable TSC, as the accounting happens during guest_exit_irqoff() itself. > > Accounting might be less-than-stellar if TSC is unstable, but I don't think > > it > > would be as binary of a failure as tick-based accounting. > > > > Oh, yea, I vaguely remember we had to deal with a very similar problem > but for userspace/kernel accounting. It was possible to observe e.g. a > userspace task going 100% kernel while in reality it was just perfectly > synchronized with the tick and doing a syscall just before it arrives > (or something like that, I may be misremembering the details). Yes. :) commit 2a42eb9594a1 ("sched/cputime: Accumulate vtime on top of nsec clocksource") > So depending on the frequency, it is probably possible to e.g observe > '100% host' with tick based accounting, the guest just has to > synchronize exiting to KVM in a way that the tick will always arrive > past guest_exit_irqoff(). 
> > It seems to me this is a fundamental problem in case the frequency of > guest exits can match the frequency of the time accounting tick. > > -- > Vitaly >
Re: [PATCH] Revert "KVM: x86: Unconditionally enable irqs in guest context"
On Wed, 6 Jan 2021 at 06:30, Nitesh Narayan Lal wrote: > > This reverts commit d7a08882a0a4b4e176691331ee3f492996579534. > > After the introduction of the patch: > > 87fa7f3e9: x86/kvm: Move context tracking where it belongs > > since we have moved guest_exit_irqoff closer to the VM-Exit, explicit > enabling of irqs to process pending interrupts should not be required > within vcpu_enter_guest anymore. > > Conflicts: > arch/x86/kvm/svm.c > > Signed-off-by: Nitesh Narayan Lal > --- > arch/x86/kvm/svm/svm.c | 9 + > arch/x86/kvm/x86.c | 11 --- > 2 files changed, 9 insertions(+), 11 deletions(-) > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index cce0143a6f80..c9b2fbb32484 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -4187,6 +4187,15 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, > > static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) > { > + kvm_before_interrupt(vcpu); > + local_irq_enable(); > + /* > +* We must have an instruction with interrupts enabled, so > +* the timer interrupt isn't delayed by the interrupt shadow. > +*/ > + asm("nop"); > + local_irq_disable(); > + kvm_after_interrupt(vcpu); > } Why do we need to reintroduce this part? Wanpeng
Re: [PATCH] KVM: X86: Expose KVM_HINTS_REALTIME in KVM_GET_SUPPORTED_CPUID
On Thu, 22 Oct 2020 at 21:02, Paolo Bonzini wrote: > > On 22/10/20 03:34, Wanpeng Li wrote: > > From: Wanpeng Li > > > > Per KVM_GET_SUPPORTED_CPUID ioctl documentation: > > > > This ioctl returns x86 cpuid features which are supported by both the > > hardware and kvm in its default configuration. > > > > A well-behaved userspace should not set the bit if it is not supported. > > > > Suggested-by: Jim Mattson > > Signed-off-by: Wanpeng Li > > It's common for userspace to copy all supported CPUID bits to > KVM_SET_CPUID2, I don't think this is the right behavior for > KVM_HINTS_REALTIME. > > (But maybe this was discussed already; if so, please point me to the > previous discussion). The discussion is here. :) https://www.spinics.net/lists/kvm/msg227265.html Wanpeng
[PATCH] KVM: X86: Expose KVM_HINTS_REALTIME in KVM_GET_SUPPORTED_CPUID
From: Wanpeng Li Per KVM_GET_SUPPORTED_CPUID ioctl documentation: This ioctl returns x86 cpuid features which are supported by both the hardware and kvm in its default configuration. A well-behaved userspace should not set the bit if it is not supported. Suggested-by: Jim Mattson Signed-off-by: Wanpeng Li --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 06a278b..225d251 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -789,7 +789,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = 0; entry->ecx = 0; - entry->edx = 0; + entry->edx = (1 << KVM_HINTS_REALTIME); break; case 0x8000: entry->eax = min(entry->eax, 0x801f); -- 2.7.4
Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance
Any comments? Paolo! :) On Wed, 9 Sep 2020 at 11:04, Wanpeng Li wrote: > > Any comments? guys! > On Tue, 1 Sep 2020 at 19:52, wrote: > > > > From: Yulei Zhang > > > > Currently in KVM memory virtulization we relay on mmu_lock to > > synchronize the memory mapping update, which make vCPUs work > > in serialize mode and slow down the execution, especially after > > migration to do substantial memory mapping will cause visible > > performance drop, and it can get worse if guest has more vCPU > > numbers and memories. > > > > The idea we present in this patch set is to mitigate the issue > > with pre-constructed memory mapping table. We will fast pin the > > guest memory to build up a global memory mapping table according > > to the guest memslots changes and apply it to cr3, so that after > > guest starts up all the vCPUs would be able to update the memory > > simultaneously without page fault exception, thus the performance > > improvement is expected. > > > > We use memory dirty pattern workload to test the initial patch > > set and get positive result even with huge page enabled. For example, > > we create guest with 32 vCPUs and 64G memories, and let the vcpus > > dirty the entire memory region concurrently, as the initial patch > > eliminate the overhead of mmu_lock, in 2M/1G huge page mode we would > > get the job done in about 50% faster. > > > > We only validate this feature on Intel x86 platform. And as Ben > > pointed out in RFC V1, so far we disable the SMM for resource > > consideration, drop the mmu notification as in this case the > > memory is pinned. > > > > V1->V2: > > * Rebase the code to kernel version 5.9.0-rc1. 
> > > > Yulei Zhang (9): > > Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT > > support > > Introduce page table population function for direct build EPT feature > > Introduce page table remove function for direct build EPT feature > > Add release function for direct build ept when guest VM exit > > Modify the page fault path to meet the direct build EPT requirement > > Apply the direct build EPT according to the memory slots change > > Add migration support when using direct build EPT > > Introduce kvm module parameter global_tdp to turn on the direct build > > EPT mode > > Handle certain mmu exposed functions properly while turn on direct > > build EPT mode > > > > arch/mips/kvm/mips.c| 13 + > > arch/powerpc/kvm/powerpc.c | 13 + > > arch/s390/kvm/kvm-s390.c| 13 + > > arch/x86/include/asm/kvm_host.h | 13 +- > > arch/x86/kvm/mmu/mmu.c | 533 ++-- > > arch/x86/kvm/svm/svm.c | 2 +- > > arch/x86/kvm/vmx/vmx.c | 7 +- > > arch/x86/kvm/x86.c | 55 ++-- > > include/linux/kvm_host.h| 7 +- > > virt/kvm/kvm_main.c | 43 ++- > > 10 files changed, 639 insertions(+), 60 deletions(-) > > > > -- > > 2.17.1 > >
Re: [PATCH] KVM: SVM: Add tracepoint for cr_interception
On Fri, 4 Sep 2020 at 19:29, Haiwei Li wrote: > > From: Haiwei Li > > Add trace_kvm_cr_write and trace_kvm_cr_read for svm. > > Signed-off-by: Haiwei Li Reviewed-by: Wanpeng Li
Re: [PATCH] KVM: x86: Add kvm_x86_ops hook to short circuit emulation
On Wed, 16 Sep 2020 at 07:29, Sean Christopherson wrote: > > Replace the existing kvm_x86_ops.need_emulation_on_page_fault() with a > more generic is_emulatable(), and unconditionally call the new function > in x86_emulate_instruction(). > > KVM will use the generic hook to support multiple security related > technologies that prevent emulation in one way or another. Similar to > the existing AMD #NPF case where emulation of the current instruction is > not possible due to lack of information, AMD's SEV-ES and Intel's SGX > and TDX will introduce scenarios where emulation is impossible due to > the guest's register state being inaccessible. And again similar to the > existing #NPF case, emulation can be initiated by kvm_mmu_page_fault(), > i.e. outside of the control of vendor-specific code. > > While the cause and architecturally visible behavior of the various > cases are different, e.g. SGX will inject a #UD, AMD #NPF is a clean > resume or complete shutdown, and SEV-ES and TDX "return" an error, the > impact on the common emulation code is identical: KVM must stop > emulation immediately and resume the guest. > > Query is_emulatable() in handle_ud() as well so that the > force_emulation_prefix code doesn't incorrectly modify RIP before > calling emulate_instruction() in the absurdly unlikely scenario that > KVM encounters forced emulation in conjunction with "do not emulate". 
> > Cc: Tom Lendacky > Signed-off-by: Sean Christopherson > --- > arch/x86/include/asm/kvm_host.h | 2 +- > arch/x86/kvm/mmu/mmu.c | 12 > arch/x86/kvm/svm/svm.c | 31 ++- > arch/x86/kvm/vmx/vmx.c | 12 ++-- > arch/x86/kvm/x86.c | 9 - > 5 files changed, 33 insertions(+), 33 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 5303dbc5c9bc..fa89511ed9d6 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1221,7 +1221,7 @@ struct kvm_x86_ops { > > int (*get_msr_feature)(struct kvm_msr_entry *entry); > > - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); > + bool (*is_emulatable)(struct kvm_vcpu *vcpu, void *insn, int > insn_len); > > bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); > int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index a5d0207e7189..f818a46db58c 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -5485,18 +5485,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t > cr2_or_gpa, u64 error_code, > if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && > !is_guest_mode(vcpu)) > emulation_type |= EMULTYPE_ALLOW_RETRY_PF; > emulate: > - /* > -* On AMD platforms, under certain conditions insn_len may be zero on > #NPF. > -* This can happen if a guest gets a page-fault on data access but > the HW > -* table walker is not able to read the instruction page (e.g > instruction > -* page is not present in memory). In those cases we simply restart > the > -* guest, with the exception of AMD Erratum 1096 which is > unrecoverable. 
> -*/ > - if (unlikely(insn && !insn_len)) { > - if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) > - return 1; > - } > - > return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, >insn_len); > } > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index 03dd7bac8034..3a55495d985f 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -3933,19 +3933,10 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) > } > } > > -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) > +static bool svm_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int > insn_len) > { > - unsigned long cr4 = kvm_read_cr4(vcpu); > - bool smep = cr4 & X86_CR4_SMEP; > - bool smap = cr4 & X86_CR4_SMAP; > - bool is_user = svm_get_cpl(vcpu) == 3; > - > - /* > -* If RIP is invalid, go ahead with emulation which will cause an > -* internal error exit. > -*/ > - if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT)) > - return true; > + bool smep, smap, is_user; > + unsigned long cr4; > > /* > * Detect and workaround Errata 1096 Fam_17h_00_0Fh. > @@ -3987,6 +3978,20 @@ static bool svm_need_emulation_on_page_fault(struct > kvm_vcpu *vcpu) > * instruction pointer so we will not able to workaround it. Lets > * print the error and request to kill the guest. > */ > + if (likely(!insn || insn_len)) > + return true; > + > + /* > +* If RIP is invalid, go ahead with
Re: [PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
On Sat, 12 Sep 2020 at 14:20, Paolo Bonzini wrote: > > On 09/09/20 10:47, Wanpeng Li wrote: > >> One more thing: > >> > >> VMX version does > >> > >> vmx_complete_interrupts(vmx); > >> if (is_guest_mode(vcpu)) > >> return EXIT_FASTPATH_NONE; > >> > >> and on SVM we analyze is_guest_mode() inside > >> svm_exit_handlers_fastpath() - should we also change that for > >> conformity? > > > > Agreed, will do in v2. > > Please just send an incremental patch. Thanks! Just sent out a patch to do it. :) Wanpeng
[PATCH] KVM: SVM: Analyze is_guest_mode() in svm_vcpu_run()
From: Wanpeng Li Analyze is_guest_mode() in svm_vcpu_run() instead of svm_exit_handlers_fastpath() in conformity with VMX version. Suggested-by: Vitaly Kuznetsov Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3da5b2f..009035a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3393,8 +3393,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -3580,6 +3579,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + exit_fastpath = svm_exit_handlers_fastpath(vcpu); return exit_fastpath; } -- 2.7.4
[PATCH v2 3/9] KVM: LAPIC: Fix updating DFR missing apic map recalculation
From: Wanpeng Li There is missing apic map recalculation after updating DFR, if it is INIT RESET, in x2apic mode, local apic is software enabled before. This patch fix it by introducing the function kvm_apic_set_dfr() to be called in INIT RESET handling path. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 33aab20..e446bdf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); } +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) +{ + kvm_lapic_set_reg(apic, APIC_DFR, val); + atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); +} + static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) { return ((id >> 4) << 16) | (1 << (id & 0xf)); @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; case APIC_DFR: - if (!apic_x2apic_mode(apic)) { - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFF); - atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); - } else + if (!apic_x2apic_mode(apic)) + kvm_apic_set_dfr(apic, val | 0x0FFF); + else ret = 1; break; @@ -2301,7 +2306,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); - kvm_lapic_set_reg(apic, APIC_DFR, 0xU); + kvm_apic_set_dfr(apic, 0xU); apic_set_spiv(apic, 0xff); kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); if (!apic_x2apic_mode(apic)) -- 2.7.4
[PATCH v2 4/9] KVM: VMX: Don't freeze guest when event delivery causes an APIC-access exit
From: Wanpeng Li According to SDM 27.2.4, Event delivery causes an APIC-access VM exit. Don't report internal error and freeze guest when event delivery causes an APIC-access exit, it is handleable and the event will be re-injected during the next vmentry. Signed-off-by: Wanpeng Li --- arch/x86/kvm/vmx/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 819c185..a544351 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6054,6 +6054,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; -- 2.7.4
[PATCH v2 8/9] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..dafc14d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH v2 9/9] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index dafc14d..b3e3429 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,10 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3499,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3530,6 +3533,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); vmcb_mark_all_clean(svm->vmcb); + + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + exit_fastpath = svm_exit_handlers_fastpath(vcpu); + return exit_fastpath; } -- 2.7.4
[PATCH v2 5/9] KVM: LAPIC: Narrow down the kick target vCPU
From: Wanpeng Li The kick after setting KVM_REQ_PENDING_TIMER is used to handle the timer fires on a different pCPU which vCPU is running on, this kick is expensive since memory barrier, rcu, preemption disable/enable operations. We don't need this kick when injecting already-expired timer, we also should call out the VMX preemption timer case, which also passes from_timer_fn=false but doesn't need a kick because kvm_lapic_expired_hv_timer() is called from the target vCPU. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 6 -- arch/x86/kvm/x86.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e446bdf..3b32d3b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(>lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6cf..dcf4494 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1774,12 +1774,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab69..ea20b8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu 
*vcpu, struct msr_data *msr); -- 2.7.4
[PATCH v2 7/9] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Reviewed-by: Vitaly Kuznetsov Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH v2 6/9] KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns
From: Wanpeng Li All the checks in lapic_timer_int_injected(), __kvm_wait_lapic_expire(), and these function calls waste cpu cycles when the timer mode is not tscdeadline. We can observe ~1.3% world switch time overhead by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. This patch reduces the world switch latency caused by timer_advance_ns feature when the timer mode is not tscdeadline by simpling move the check against apic->lapic_timer.expired_tscdeadline much earlier. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 11 +-- arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c 
+++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if -- 2.7.4
[PATCH v2 2/9] KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting
From: Wanpeng Li Check apic_lvtt_tscdeadline() mode directly instead of apic_lvtt_oneshot() and apic_lvtt_period() to guarantee the timer is in tsc-deadline mode when wrmsr MSR_IA32_TSCDEADLINE. Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 81bf6a8..33aab20 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2193,8 +2193,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return; hrtimer_cancel(>lapic_timer.timer); -- 2.7.4
[PATCH v2 1/9] KVM: LAPIC: Return 0 when getting the tscdeadline timer if the lapic is hw disabled
From: Wanpeng Li Return 0 when getting the tscdeadline timer if the lapic is hw disabled. Suggested-by: Paolo Bonzini Reviewed-by: Sean Christopherson Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 35cca2e..81bf6a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2183,8 +2183,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || - !apic_lvtt_tscdeadline(apic)) + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; -- 2.7.4
[PATCH v2 0/9] KVM: collect sporadic patches
Collect sporadic patches for easy apply. Wanpeng Li (9): KVM: LAPIC: Return 0 when getting the tscdeadline timer if the lapic is hw disabled KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting KVM: LAPIC: Fix updating DFR missing apic map recalculation KVM: VMX: Don't freeze guest when event delivery causes an APIC-access exit KVM: LAPIC: Narrow down the kick target vCPU KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff() KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run() KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts() arch/x86/kvm/lapic.c | 36 arch/x86/kvm/svm/svm.c | 17 + arch/x86/kvm/vmx/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 6 -- arch/x86/kvm/x86.h | 1 - 5 files changed, 31 insertions(+), 34 deletions(-) -- 2.7.4
Re: [PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
On Wed, 9 Sep 2020 at 16:36, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX > > and SVM with respect to completing interrupts. > > > > Suggested-by: Sean Christopherson > > Cc: Paul K. > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/svm/svm.c | 4 ++-- > > 1 file changed, 2 insertions(+), 2 deletions(-) > > > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > > index c61bc3b..74bcf0a 100644 > > --- a/arch/x86/kvm/svm/svm.c > > +++ b/arch/x86/kvm/svm/svm.c > > @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, > > fastpath_t exit_fastpath) > > if (npt_enabled) > > vcpu->arch.cr3 = svm->vmcb->save.cr3; > > > > - svm_complete_interrupts(svm); > > - > > if (is_guest_mode(vcpu)) { > > int vmexit; > > > > @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct > > kvm_vcpu *vcpu) > >SVM_EXIT_EXCP_BASE + MC_VECTOR)) > > svm_handle_mce(svm); > > > > + svm_complete_interrupts(svm); > > + > > vmcb_mark_all_clean(svm->vmcb); > > return exit_fastpath; > > } > > This seems to be the right thing to do, however, the amount of code > between kvm_x86_ops.run() and kvm_x86_ops.handle_exit() is non-trivial, > hope it won't blow up in testing... > > Reviewed-by: Vitaly Kuznetsov > > One more thing: > > VMX version does > > vmx_complete_interrupts(vmx); > if (is_guest_mode(vcpu)) > return EXIT_FASTPATH_NONE; > > and on SVM we analyze is_guest_mode() inside > svm_exit_handlers_fastpath() - should we also change that for > conformity? Agreed, will do in v2. Wanpeng
Re: [PATCH 1/2] KVM: LAPIC: Fix updating DFR missing apic map recalculation
Any Reviewed-by for these two patches? :) On Wed, 19 Aug 2020 at 16:55, Wanpeng Li wrote: > > From: Wanpeng Li > > There is missing apic map recalculation after updating DFR, if it is > INIT RESET, in x2apic mode, local apic is software enabled before. > This patch fix it by introducing the function kvm_apic_set_dfr() to > be called in INIT RESET handling path. > > Signed-off-by: Wanpeng Li > --- > arch/x86/kvm/lapic.c | 15 ++- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 5ccbee7..248095a 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic > *apic, u32 id) > atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > } > > +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) > +{ > + kvm_lapic_set_reg(apic, APIC_DFR, val); > + atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > +} > + > static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) > { > return ((id >> 4) << 16) | (1 << (id & 0xf)); > @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 > reg, u32 val) > break; > > case APIC_DFR: > - if (!apic_x2apic_mode(apic)) { > - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFF); > - > atomic_set_release(>vcpu->kvm->arch.apic_map_dirty, DIRTY); > - } else > + if (!apic_x2apic_mode(apic)) > + kvm_apic_set_dfr(apic, val | 0x0FFF); > + else > ret = 1; > break; > > @@ -2303,7 +2308,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool > init_event) > SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > > - kvm_lapic_set_reg(apic, APIC_DFR, 0xU); > + kvm_apic_set_dfr(apic, 0xU); > apic_set_spiv(apic, 0xff); > kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); > if (!apic_x2apic_mode(apic)) > -- > 2.7.4 >
Re: [PATCH 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
On Wed, 9 Sep 2020 at 16:23, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > Analysis from Sean: > > > > | svm->next_rip is reset in svm_vcpu_run() only after calling > > | svm_exit_handlers_fastpath(), which will cause SVM's > > | skip_emulated_instruction() to write a stale RIP. > > > > This should only happen when svm->vmcb->control.next_rip is not set by > hardware as skip_emulated_instruction() itself sets 'svm->next_rip' > otherwise, right? The bug is reported here https://bugzilla.kernel.org/show_bug.cgi?id=209155 , the old machine which the reporter uses doesn't have NRIP save on #VMEXIT support. :) Wanpeng
Re: [RFC V2 0/9] x86/mmu:Introduce parallel memory virtualization to boost performance
Any comments, guys? On Tue, 1 Sep 2020 at 19:52, wrote: > > From: Yulei Zhang > > Currently in KVM memory virtualization we rely on mmu_lock to > synchronize the memory mapping update, which make vCPUs work > in serialize mode and slow down the execution, especially after > migration to do substantial memory mapping will cause visible > performance drop, and it can get worse if guest has more vCPU > numbers and memories. > > The idea we present in this patch set is to mitigate the issue > with pre-constructed memory mapping table. We will fast pin the > guest memory to build up a global memory mapping table according > to the guest memslots changes and apply it to cr3, so that after > guest starts up all the vCPUs would be able to update the memory > simultaneously without page fault exception, thus the performance > improvement is expected. > > We use memory dirty pattern workload to test the initial patch > set and get positive result even with huge page enabled. For example, > we create guest with 32 vCPUs and 64G memories, and let the vcpus > dirty the entire memory region concurrently, as the initial patch > eliminate the overhead of mmu_lock, in 2M/1G huge page mode we would > get the job done in about 50% faster. > > We only validate this feature on Intel x86 platform. And as Ben > pointed out in RFC V1, so far we disable the SMM for resource > consideration, drop the mmu notification as in this case the > memory is pinned. > > V1->V2: > * Rebase the code to kernel version 5.9.0-rc1. 
> > Yulei Zhang (9): > Introduce new fields in kvm_arch/vcpu_arch struct for direct build EPT > support > Introduce page table population function for direct build EPT feature > Introduce page table remove function for direct build EPT feature > Add release function for direct build ept when guest VM exit > Modify the page fault path to meet the direct build EPT requirement > Apply the direct build EPT according to the memory slots change > Add migration support when using direct build EPT > Introduce kvm module parameter global_tdp to turn on the direct build > EPT mode > Handle certain mmu exposed functions properly while turn on direct > build EPT mode > > arch/mips/kvm/mips.c| 13 + > arch/powerpc/kvm/powerpc.c | 13 + > arch/s390/kvm/kvm-s390.c| 13 + > arch/x86/include/asm/kvm_host.h | 13 +- > arch/x86/kvm/mmu/mmu.c | 533 ++-- > arch/x86/kvm/svm/svm.c | 2 +- > arch/x86/kvm/vmx/vmx.c | 7 +- > arch/x86/kvm/x86.c | 55 ++-- > include/linux/kvm_host.h| 7 +- > virt/kvm/kvm_main.c | 43 ++- > 10 files changed, 639 insertions(+), 60 deletions(-) > > -- > 2.17.1 >
[PATCH RESEND 3/3] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 74bcf0a..ac819f0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3500,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3529,6 +3533,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; -- 2.7.4
[PATCH RESEND 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..74bcf0a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH RESEND 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH 2/3] KVM: SVM: Move svm_complete_interrupts() into svm_vcpu_run()
From: Wanpeng Li Moving svm_complete_interrupts() into svm_vcpu_run() which can align VMX and SVM with respect to completing interrupts. Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c61bc3b..74bcf0a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2938,8 +2938,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3530,6 +3528,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); + svm_complete_interrupts(svm); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -- 2.7.4
[PATCH 3/3] KVM: SVM: Reenable handle_fastpath_set_msr_irqoff() after complete_interrupts()
From: Wanpeng Li Moving the call to svm_exit_handlers_fastpath() after svm_complete_interrupts() since svm_complete_interrupts() consumes rip and reenable the function handle_fastpath_set_msr_irqoff() call in svm_exit_handlers_fastpath(). Suggested-by: Sean Christopherson Cc: Paul K. Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 74bcf0a..ac819f0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3347,6 +3347,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + return EXIT_FASTPATH_NONE; } @@ -3495,7 +3500,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(>vcpu); @@ -3529,6 +3533,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; -- 2.7.4
[PATCH 1/3] KVM: SVM: Get rid of handle_fastpath_set_msr_irqoff()
From: Wanpeng Li Analysis from Sean: | svm->next_rip is reset in svm_vcpu_run() only after calling | svm_exit_handlers_fastpath(), which will cause SVM's | skip_emulated_instruction() to write a stale RIP. Let's get rid of handle_fastpath_set_msr_irqoff() in svm_exit_handlers_fastpath() to have a quick fix. Reported-by: Paul K. Suggested-by: Sean Christopherson Cc: Paul K. Cc: # v5.8-rc1+ Fixes: 404d5d7bff0d (KVM: X86: Introduce more exit_fastpath_completion enum values) Signed-off-by: Wanpeng Li --- arch/x86/kvm/svm/svm.c | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 19e622a..c61bc3b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3349,11 +3349,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - return handle_fastpath_set_msr_irqoff(vcpu); - return EXIT_FASTPATH_NONE; } -- 2.7.4
[PATCH v2] KVM: LAPIC: Reduce world switch latency caused by timer_advance_ns
From: Wanpeng Li All the checks in lapic_timer_int_injected(), __kvm_wait_lapic_expire(), and these function calls waste cpu cycles when the timer mode is not tscdeadline. We can observe ~1.3% world switch time overhead by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. This patch reduces the world switch latency caused by timer_advance_ns feature when the timer mode is not tscdeadline by simpling move the check against apic->lapic_timer.expired_tscdeadline much earlier. Signed-off-by: Wanpeng Li --- v1 -> v2: * move the check against apic->lapic_timer.expired_tscdeadline much earlier instead of reset timer_advance_ns arch/x86/kvm/lapic.c | 11 +-- arch/x86/kvm/svm/svm.c | 4 +--- arch/x86/kvm/vmx/vmx.c | 4 +--- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (apic->lapic_timer.expired_tscdeadline == 0) - return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { - if (lapic_timer_int_injected(vcpu)) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.expired_tscdeadline && + vcpu->arch.apic->lapic_timer.timer_advance_ns && + lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { - if (apic->lapic_timer.timer_advance_ns) - __kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); - if (lapic_in_kernel(vcpu) && - vcpu->arch.apic->lapic_timer.timer_advance_ns) - kvm_wait_lapic_expire(vcpu); + kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if -- 2.7.4
Re: [PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
On Thu, 3 Sep 2020 at 05:23, Sean Christopherson wrote: > > On Fri, Aug 28, 2020 at 09:35:08AM +0800, Wanpeng Li wrote: > > From: Wanpeng Li > > > > per-vCPU timer_advance_ns should be set to 0 if timer mode is not > > tscdeadline > > otherwise we waste cpu cycles in the function lapic_timer_int_injected(), > > especially on AMD platform which doesn't support tscdeadline mode. We can > > reset timer_advance_ns to the initial value if switch back to tscdealine > > timer mode. > > > > Signed-off-by: Wanpeng Li > > --- > > arch/x86/kvm/lapic.c | 6 ++ > > 1 file changed, 6 insertions(+) > > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > > index 654649b..abc296d 100644 > > --- a/arch/x86/kvm/lapic.c > > +++ b/arch/x86/kvm/lapic.c > > @@ -1499,10 +1499,16 @@ static void apic_update_lvtt(struct kvm_lapic *apic) > > kvm_lapic_set_reg(apic, APIC_TMICT, 0); > > apic->lapic_timer.period = 0; > > apic->lapic_timer.tscdeadline = 0; > > + if (timer_mode == APIC_LVT_TIMER_TSCDEADLINE && > > + lapic_timer_advance_dynamic) > > Bad indentation. > > > + apic->lapic_timer.timer_advance_ns = > > LAPIC_TIMER_ADVANCE_NS_INIT; > > Redoing the tuning seems odd. Doubt it will matter, but it feels weird to > have to retune the advancement just because the guest toggled between modes. > > Rather than clear timer_advance_ns, can we simply move the check against > apic->lapic_timer.expired_tscdeadline much earlier? I think that would > solve this performance hiccup, and IMO would be a logical change in any > case. E.g. 
with some refactoring to avoid more duplication between VMX and > SVM How about something like below: diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3b32d3b..51ed4f0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1582,9 +1582,6 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; -if (apic->lapic_timer.expired_tscdeadline == 0) -return; - tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); @@ -1599,7 +1596,10 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu) { -if (lapic_timer_int_injected(vcpu)) +if (lapic_in_kernel(vcpu) && +vcpu->arch.apic->lapic_timer.expired_tscdeadline && +vcpu->arch.apic->lapic_timer.timer_advance_ns && +lapic_timer_int_injected(vcpu)) __kvm_wait_lapic_expire(vcpu); } EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire); @@ -1635,8 +1635,7 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } if (kvm_use_posted_timer_interrupt(apic->vcpu)) { -if (apic->lapic_timer.timer_advance_ns) -__kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); kvm_apic_inject_pending_timer_irqs(apic); return; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336..19e622a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3456,9 +3456,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); -if (lapic_in_kernel(vcpu) && -vcpu->arch.apic->lapic_timer.timer_advance_ns) -kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a544351..d6e1656 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6800,9 +6800,7 @@ static fastpath_t 
vmx_vcpu_run(struct kvm_vcpu *vcpu) if (enable_preemption_timer) vmx_update_hv_timer(vcpu); -if (lapic_in_kernel(vcpu) && -vcpu->arch.apic->lapic_timer.timer_advance_ns) -kvm_wait_lapic_expire(vcpu); +kvm_wait_lapic_expire(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if
Re: [PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
On Mon, 31 Aug 2020 at 20:48, Vitaly Kuznetsov wrote: > > Wanpeng Li writes: > > > From: Wanpeng Li > > > > per-vCPU timer_advance_ns should be set to 0 if timer mode is not > > tscdeadline > > otherwise we waste cpu cycles in the function lapic_timer_int_injected(), > > lapic_timer_int_injected is just a test, kvm_wait_lapic_expire() > (__kvm_wait_lapic_expire()) maybe? Both the check in lapic_timer_int_injected(), the check in __kvm_wait_lapic_expire(), and these function calls, we can observe ~1.3% world switch time reduction w/ this patch by kvm-unit-tests/vmexit.flat vmcall testing on AMD server. In addition, I think we should set apic->lapic_timer.expired_tscdeadline to 0 when switching between tscdeadline mode and other modes on Intel in order that we will not waste cpu cycles to tune advance value in adjust_lapic_timer_advance() for one time. Wanpeng
[PATCH] KVM: LAPIC: Reset timer_advance_ns if timer mode switch
From: Wanpeng Li per-vCPU timer_advance_ns should be set to 0 if timer mode is not tscdeadline, otherwise we waste cpu cycles in the function lapic_timer_int_injected(), especially on AMD platform which doesn't support tscdeadline mode. We can reset timer_advance_ns to the initial value if switching back to tscdeadline timer mode. Signed-off-by: Wanpeng Li --- arch/x86/kvm/lapic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 654649b..abc296d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1499,10 +1499,16 @@ static void apic_update_lvtt(struct kvm_lapic *apic) kvm_lapic_set_reg(apic, APIC_TMICT, 0); apic->lapic_timer.period = 0; apic->lapic_timer.tscdeadline = 0; + if (timer_mode == APIC_LVT_TIMER_TSCDEADLINE && + lapic_timer_advance_dynamic) + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; } apic->lapic_timer.timer_mode = timer_mode; limit_periodic_timer_frequency(apic); } + if (timer_mode != APIC_LVT_TIMER_TSCDEADLINE && + lapic_timer_advance_dynamic) + apic->lapic_timer.timer_advance_ns = 0; } /* -- 2.7.4
Re: [PATCH v2] KVM: LAPIC: Narrow down the kick target vCPU
On Mon, 24 Aug 2020 at 09:03, Wanpeng Li wrote: > > From: Wanpeng Li > > The kick after setting KVM_REQ_PENDING_TIMER is used to handle the timer > fires on a different pCPU which vCPU is running on, this kick is expensive > since memory barrier, rcu, preemption disable/enable operations. We don't > need this kick when injecting already-expired timer, we also should call > out the VMX preemption timer case, which also passes from_timer_fn=false > but doesn't need a kick because kvm_lapic_expired_hv_timer() is called > from the target vCPU. > I missed Sean's Reviewed-by tag. Reviewed-by: Sean Christopherson > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * update patch subject and changelog > * open code kvm_set_pending_timer() > > arch/x86/kvm/lapic.c | 4 +++- > arch/x86/kvm/x86.c | 6 ------ > arch/x86/kvm/x86.h | 1 - > 3 files changed, 3 insertions(+), 8 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 248095a..97f1dbf 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, > bool from_timer_fn) > } > > atomic_inc(&apic->lapic_timer.pending); > - kvm_set_pending_timer(vcpu); > + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > + if (from_timer_fn) > + kvm_vcpu_kick(vcpu); > } > > static void start_sw_tscdeadline(struct kvm_lapic *apic) > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 599d732..51b74d0 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1778,12 +1778,6 @@ static s64 get_kvmclock_base_ns(void) > } > #endif > > -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) > -{ > - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > - kvm_vcpu_kick(vcpu); > -} > - > static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) > { > int version; > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index 995ab69..ea20b8b 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -246,7 +246,6 @@ static inline bool
kvm_vcpu_latch_init(struct kvm_vcpu > *vcpu) > return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); > } > > -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); > void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int > inc_eip); > > void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); > -- > 2.7.4 >
Re: [PATCH 1/2] KVM: LAPIC: Fix updating DFR missing apic map recalculation
ping, :) On Wed, 19 Aug 2020 at 16:55, Wanpeng Li wrote: > > From: Wanpeng Li > > There is a missing apic map recalculation after updating DFR, if it is > INIT RESET, in x2apic mode, local apic is software enabled before. > This patch fixes it by introducing the function kvm_apic_set_dfr() to > be called in INIT RESET handling path. > > Signed-off-by: Wanpeng Li > --- > arch/x86/kvm/lapic.c | 15 ++++++++++----- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 5ccbee7..248095a 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -310,6 +310,12 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic > *apic, u32 id) > atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > } > > +static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val) > +{ > + kvm_lapic_set_reg(apic, APIC_DFR, val); > + atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > +} > + > static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) > { > return ((id >> 4) << 16) | (1 << (id & 0xf)); > @@ -1984,10 +1990,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 > reg, u32 val) > break; > > case APIC_DFR: > - if (!apic_x2apic_mode(apic)) { > - kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); > - > atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY); > - } else > + if (!apic_x2apic_mode(apic)) > + kvm_apic_set_dfr(apic, val | 0x0FFFFFFF); > + else > ret = 1; > break; > > @@ -2303,7 +2308,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool > init_event) > SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); > apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); > > - kvm_lapic_set_reg(apic, APIC_DFR, 0xffffffffU); > + kvm_apic_set_dfr(apic, 0xffffffffU); > apic_set_spiv(apic, 0xff); > kvm_lapic_set_reg(apic, APIC_TASKPRI, 0); > if (!apic_x2apic_mode(apic)) > -- > 2.7.4 >
Re: [PATCH v2 2/2] KVM: LAPIC: Guarantee the timer is in tsc-deadline mode when setting
ping :) On Wed, 12 Aug 2020 at 14:30, Wanpeng Li wrote: > > From: Wanpeng Li > > Check apic_lvtt_tscdeadline() mode directly instead of apic_lvtt_oneshot() > and apic_lvtt_period() to guarantee the timer is in tsc-deadline mode when > wrmsr MSR_IA32_TSCDEADLINE. > > Signed-off-by: Wanpeng Li > --- > v1 -> v2: > * fix indentation > > arch/x86/kvm/lapic.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c > index 79599af..abaf48e 100644 > --- a/arch/x86/kvm/lapic.c > +++ b/arch/x86/kvm/lapic.c > @@ -2193,8 +2193,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu > *vcpu, u64 data) > { > struct kvm_lapic *apic = vcpu->arch.apic; > > - if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) || > - apic_lvtt_period(apic)) > + if (!kvm_apic_present(vcpu) || !apic_lvtt_tscdeadline(apic)) > return; > > hrtimer_cancel(&apic->lapic_timer.timer); > -- > 2.7.4 >
[PATCH v2] KVM: LAPIC: Narrow down the kick target vCPU
From: Wanpeng Li The kick after setting KVM_REQ_PENDING_TIMER is used to handle the case where the timer fires on a different pCPU from the one the vCPU is running on; this kick is expensive since it involves memory barrier, rcu, preemption disable/enable operations. We don't need this kick when injecting an already-expired timer; we also should call out the VMX preemption timer case, which also passes from_timer_fn=false but doesn't need a kick because kvm_lapic_expired_hv_timer() is called from the target vCPU. Signed-off-by: Wanpeng Li --- v1 -> v2: * update patch subject and changelog * open code kvm_set_pending_timer() arch/x86/kvm/lapic.c | 4 +++- arch/x86/kvm/x86.c | 6 ------ arch/x86/kvm/x86.h | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 248095a..97f1dbf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1642,7 +1642,9 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn) } atomic_inc(&apic->lapic_timer.pending); - kvm_set_pending_timer(vcpu); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + if (from_timer_fn) + kvm_vcpu_kick(vcpu); } static void start_sw_tscdeadline(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 599d732..51b74d0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1778,12 +1778,6 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 995ab69..ea20b8b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -246,7 +246,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu); } -void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq,
int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); -- 2.7.4