On 05/05/20 15:16, Thomas Gleixner wrote:
> The async page fault injection into kernel space creates more problems than
> it solves. The host has absolutely no knowledge about the state of the
> guest if the fault happens in CPL0. The only restriction for the host is
> interrupt disabled state. If interrupts are enabled in the guest then the
> exception can hit arbitrary code. The HALT based wait in non-preemptible
> code is a hacky replacement for a proper hypercall.
> 
> For the ongoing work to restrict instrumentation and make the RCU idle
> interaction well defined the required extra work for supporting async
> pagefault in CPL0 is just not justified and creates complexity for a
> dubious benefit.
> 
> The CPL3 injection is well defined and does not cause any issues as it is
> more or less the same as a regular page fault from CPL3.
> 
> Suggested-by: Andy Lutomirski <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
>  arch/x86/kernel/kvm.c |  100 +++-----------------------------------------------
>  1 file changed, 7 insertions(+), 93 deletions(-)
> 
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -75,7 +75,6 @@ struct kvm_task_sleep_node {
>       struct swait_queue_head wq;
>       u32 token;
>       int cpu;
> -     bool use_halt;
>  };
>  
>  static struct kvm_task_sleep_head {
> @@ -98,8 +97,7 @@ static struct kvm_task_sleep_node *_find
>       return NULL;
>  }
>  
> -static bool kvm_async_pf_queue_task(u32 token, bool use_halt,
> -                                 struct kvm_task_sleep_node *n)
> +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
>  {
>       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
>       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
> @@ -117,7 +115,6 @@ static bool kvm_async_pf_queue_task(u32
>  
>       n->token = token;
>       n->cpu = smp_processor_id();
> -     n->use_halt = use_halt;
>       init_swait_queue_head(&n->wq);
>       hlist_add_head(&n->link, &b->list);
>       raw_spin_unlock(&b->lock);
> @@ -138,7 +135,7 @@ void kvm_async_pf_task_wait_schedule(u32
>  
>       lockdep_assert_irqs_disabled();
>  
> -     if (!kvm_async_pf_queue_task(token, false, &n))
> +     if (!kvm_async_pf_queue_task(token, &n))
>               return;
>  
>       for (;;) {
> @@ -154,91 +151,10 @@ void kvm_async_pf_task_wait_schedule(u32
>  }
>  EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
>  
> -/*
> - * Invoked from the async page fault handler.
> - */
> -static void kvm_async_pf_task_wait_halt(u32 token)
> -{
> -     struct kvm_task_sleep_node n;
> -
> -     if (!kvm_async_pf_queue_task(token, true, &n))
> -             return;
> -
> -     for (;;) {
> -             if (hlist_unhashed(&n.link))
> -                     break;
> -             /*
> -              * No point in doing anything about RCU here. Any RCU read
> -              * side critical section or RCU watching section can be
> -              * interrupted by VMEXITs and the host is free to keep the
> -              * vCPU scheduled out as long as it sees fit. This is not
> -              * any different just because of the halt induced voluntary
> -              * VMEXIT.
> -              *
> -              * Also the async page fault could have interrupted any RCU
> -              * watching context, so invoking rcu_irq_exit()/enter()
> -              * around this is not gaining anything.
> -              */
> -             native_safe_halt();
> -             local_irq_disable();
> -     }
> -}
> -
> -/* Invoked from the async page fault handler */
> -static void kvm_async_pf_task_wait(u32 token, bool usermode)
> -{
> -     bool can_schedule;
> -
> -     /*
> -      * No need to check whether interrupts were disabled because the
> -      * host will (hopefully) only inject an async page fault into
> -      * interrupt enabled regions.
> -      *
> -      * If CONFIG_PREEMPTION is enabled then check whether the code
> -      * which triggered the page fault is preemptible. This covers user
> -      * mode as well because preempt_count() is obviously 0 there.
> -      *
> -      * The check for rcu_preempt_depth() is also required because
> -      * voluntary scheduling inside a rcu read locked section is not
> -      * allowed.
> -      *
> -      * The idle task is already covered by this because idle always
> -      * has a preempt count > 0.
> -      *
> -      * If CONFIG_PREEMPTION is disabled only allow scheduling when
> -      * coming from user mode as there is no indication whether the
> -      * context which triggered the page fault could schedule or not.
> -      */
> -     if (IS_ENABLED(CONFIG_PREEMPTION))
> -             can_schedule = preempt_count() + rcu_preempt_depth() == 0;
> -     else
> -             can_schedule = usermode;
> -
> -     /*
> -      * If the kernel context is allowed to schedule then RCU is
> -      * watching because no preemptible code in the kernel is inside RCU
> -      * idle state. So it can be treated like user mode. User mode is
> -      * safe because the #PF entry invoked enter_from_user_mode().
> -      *
> -      * For the non schedulable case invoke rcu_irq_enter() for
> -      * now. This will be moved out to the pagefault entry code later
> -      * and only invoked when really needed.
> -      */
> -     if (can_schedule) {
> -             kvm_async_pf_task_wait_schedule(token);
> -     } else {
> -             rcu_irq_enter();
> -             kvm_async_pf_task_wait_halt(token);
> -             rcu_irq_exit();
> -     }
> -}
> -
>  static void apf_task_wake_one(struct kvm_task_sleep_node *n)
>  {
>       hlist_del_init(&n->link);
> -     if (n->use_halt)
> -             smp_send_reschedule(n->cpu);
> -     else if (swq_has_sleeper(&n->wq))
> +     if (swq_has_sleeper(&n->wq))
>               swake_up_one(&n->wq);
>  }
>  
> @@ -337,8 +253,10 @@ bool __kvm_handle_async_pf(struct pt_reg
>               panic("Host injected async #PF in interrupt disabled region\n");
>  
>       if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
> -             /* page is swapped out by the host. */
> -             kvm_async_pf_task_wait(token, user_mode(regs));
> +             if (unlikely(!(user_mode(regs))))
> +                     panic("Host injected async #PF in kernel mode\n");
> +             /* Page is swapped out by the host. */
> +             kvm_async_pf_task_wait_schedule(token);
>       } else {
>               rcu_irq_enter();
>               kvm_async_pf_task_wake(token);
> @@ -397,10 +315,6 @@ static void kvm_guest_cpu_init(void)
>               WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
>  
>               pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
> -
> -#ifdef CONFIG_PREEMPTION
> -             pa |= KVM_ASYNC_PF_SEND_ALWAYS;
> -#endif
>               pa |= KVM_ASYNC_PF_ENABLED;
>  
>               if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
> 
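For anyone skimming the diff rather than the tree: after this patch the
guest-side handling collapses to "CPL3 sleeps schedulably, CPL0 panics".
Below is a minimal userspace sketch of that decision flow. To be clear,
this is not kernel code: struct pt_regs, user_mode() and the reason
constants here are simplified stand-ins for the real arch/x86
definitions, and exit() stands in for panic().

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for the kernel definitions. */
#define KVM_PV_REASON_PAGE_NOT_PRESENT	1
#define KVM_PV_REASON_PAGE_READY	2

struct pt_regs { bool user; };		/* models CPL3 vs. CPL0 */

static bool user_mode(const struct pt_regs *regs)
{
	return regs->user;
}

static void kvm_async_pf_task_wait_schedule(unsigned int token)
{
	/* In the kernel this queues the task and schedules it out. */
	printf("token %u: sleep schedulably until PAGE_READY\n", token);
}

static void kvm_async_pf_task_wake(unsigned int token)
{
	printf("token %u: wake the sleeping task\n", token);
}

/*
 * The guest-side flow left after the patch: a 'page not present'
 * notification is only acceptable from user mode; the HALT based
 * kernel-mode wait path no longer exists.
 */
static void handle_async_pf(const struct pt_regs *regs,
			    unsigned int reason, unsigned int token)
{
	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
		if (!user_mode(regs)) {
			fprintf(stderr, "async #PF injected in kernel mode\n");
			exit(EXIT_FAILURE);	/* panic() in the kernel */
		}
		kvm_async_pf_task_wait_schedule(token);
	} else if (reason == KVM_PV_REASON_PAGE_READY) {
		kvm_async_pf_task_wake(token);
	}
}

int main(void)
{
	const struct pt_regs user_regs = { .user = true };

	handle_async_pf(&user_regs, KVM_PV_REASON_PAGE_NOT_PRESENT, 42);
	handle_async_pf(&user_regs, KVM_PV_REASON_PAGE_READY, 42);
	return 0;
}

The point being: with kvm_async_pf_task_wait_halt() gone there is no
state left in which the guest kernel waits for the host in CPL0.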

Acked-by: Paolo Bonzini <[email protected]>
