This supports asynchronous page fault for the guest. The design is
similar to what x86 has: on receiving a PAGE_NOT_PRESENT signal from
the hypervisor, the current task is either rescheduled or put into
power-saving mode. The task will be waken up when PAGE_READY signal
is received.

The signals are conveyed through data abort with specific (IMPDEF)
Data Fault Status Code (DFSC). Besides, a hash table is introduced
to track the processes that have been put into waiting state, to
avoid out-of-consistency.

The feature is put into the CONFIG_KVM_GUEST umbrella, which is added
by this patch.

Signed-off-by: Gavin Shan <[email protected]>
---
 arch/arm64/Kconfig                 |  11 ++
 arch/arm64/include/asm/exception.h |   5 +
 arch/arm64/include/asm/kvm_para.h  |  42 ++++-
 arch/arm64/kernel/smp.c            |  47 ++++++
 arch/arm64/mm/fault.c              | 239 ++++++++++++++++++++++++++++-
 5 files changed, 336 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..2d5e5ee62d6d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1045,6 +1045,17 @@ config PARAVIRT
          under a hypervisor, potentially improving performance significantly
          over full virtualization.
 
+config KVM_GUEST
+       bool "KVM Guest Support"
+       depends on PARAVIRT
+       default y
+       help
+         This option enables various optimizations for running under the KVM
+         hypervisor. Overhead for the kernel when not running inside KVM should
+         be minimal.
+
+         In case of doubt, say Y
+
 config PARAVIRT_TIME_ACCOUNTING
        bool "Paravirtual steal time accounting"
        select PARAVIRT
diff --git a/arch/arm64/include/asm/exception.h 
b/arch/arm64/include/asm/exception.h
index 7a6e81ca23a8..17ac2db36472 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -46,4 +46,9 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned 
int esr);
 void do_cp15instr(unsigned int esr, struct pt_regs *regs);
 void do_el0_svc(struct pt_regs *regs);
 void do_el0_svc_compat(struct pt_regs *regs);
+
+#ifdef CONFIG_KVM_GUEST
+void kvm_pv_async_pf_enable(void);
+void kvm_pv_async_pf_disable(void);
+#endif /* CONFIG_KVM_GUEST */
 #endif /* __ASM_EXCEPTION_H */
diff --git a/arch/arm64/include/asm/kvm_para.h 
b/arch/arm64/include/asm/kvm_para.h
index 0ea481dd1c7a..a43bed479c2b 100644
--- a/arch/arm64/include/asm/kvm_para.h
+++ b/arch/arm64/include/asm/kvm_para.h
@@ -3,6 +3,30 @@
 #define _ASM_ARM_KVM_PARA_H
 
 #include <uapi/asm/kvm_para.h>
+#include <linux/of.h>
+
+#ifdef CONFIG_KVM_GUEST
+static inline int kvm_para_available(void)
+{
+       struct device_node *hyper_node;
+       int ret = 0;
+
+       hyper_node = of_find_node_by_path("/hypervisor");
+       if (!hyper_node)
+               return 0;
+
+       if (of_device_is_compatible(hyper_node, "linux,kvm"))
+               ret = 1;
+
+       of_node_put(hyper_node);
+       return ret;
+}
+#else
+static inline int kvm_para_available(void)
+{
+       return 0;
+}
+#endif /* CONFIG_KVM_GUEST */
 
 static inline bool kvm_check_and_clear_guest_paused(void)
 {
@@ -11,17 +35,21 @@ static inline bool kvm_check_and_clear_guest_paused(void)
 
 static inline unsigned int kvm_arch_para_features(void)
 {
-       return 0;
+       struct device_node *hyper_node;
+       unsigned int features = 0;
+
+       if (!kvm_para_available())
+               return 0;
+
+       hyper_node = of_find_node_by_path("/hypervisor");
+       of_property_read_u32(hyper_node, "para-features", &features);
+       of_node_put(hyper_node);
+
+       return features;
 }
 
 static inline unsigned int kvm_arch_para_hints(void)
 {
        return 0;
 }
-
-static inline bool kvm_para_available(void)
-{
-       return false;
-}
-
 #endif /* _ASM_ARM_KVM_PARA_H */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 061f60fe452f..cc97a8462d7f 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -40,6 +40,7 @@
 #include <asm/cputype.h>
 #include <asm/cpu_ops.h>
 #include <asm/daifflags.h>
+#include <asm/exception.h>
 #include <asm/kvm_mmu.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
@@ -443,6 +444,38 @@ void __init smp_cpus_done(unsigned int max_cpus)
        mark_linear_text_alias_ro();
 }
 
+#ifdef CONFIG_KVM_GUEST
+static void kvm_cpu_reboot(void *unused)
+{
+       kvm_pv_async_pf_disable();
+}
+
+static int kvm_cpu_reboot_notify(struct notifier_block *nb,
+                                unsigned long code, void *unused)
+{
+       if (code == SYS_RESTART)
+               on_each_cpu(kvm_cpu_reboot, NULL, 1);
+
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_cpu_reboot_nb = {
+       .notifier_call = kvm_cpu_reboot_notify,
+};
+
+static int kvm_cpu_online(unsigned int cpu)
+{
+       kvm_pv_async_pf_enable();
+       return 0;
+}
+
+static int kvm_cpu_offline(unsigned int cpu)
+{
+       kvm_pv_async_pf_disable();
+       return 0;
+}
+#endif /* CONFIG_KVM_GUEST */
+
 void __init smp_prepare_boot_cpu(void)
 {
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
@@ -458,6 +491,20 @@ void __init smp_prepare_boot_cpu(void)
        /* Conditionally switch to GIC PMR for interrupt masking */
        if (system_uses_irq_prio_masking())
                init_gic_priority_masking();
+
+
+       /* Enable async page fault */
+#ifdef CONFIG_KVM_GUEST
+       register_reboot_notifier(&kvm_cpu_reboot_nb);
+       if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+               "arm/kvm:online", kvm_cpu_online, kvm_cpu_offline) < 0) {
+               pr_warn("%s: Failed to install cpu hotplug callbacks\n",
+                       __func__);
+               return;
+       }
+
+       kvm_pv_async_pf_enable();
+#endif /* CONFIG_KVM_GUEST */
 }
 
 static u64 __init of_get_cpu_mpidr(struct device_node *dn)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 1027851d469a..39c7570fe303 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -19,10 +19,12 @@
 #include <linux/page-flags.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/debug.h>
+#include <linux/swait.h>
 #include <linux/highmem.h>
 #include <linux/perf_event.h>
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_para.h>
 
 #include <asm/acpi.h>
 #include <asm/bug.h>
@@ -48,8 +50,31 @@ struct fault_info {
        const char *name;
 };
 
+#ifdef CONFIG_KVM_GUEST
+#define KVM_TASK_SLEEP_HASHBITS                8
+#define KVM_TASK_SLEEP_HASHSIZE        (1 << KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+       struct hlist_node       link;
+       struct swait_queue_head wq;
+       u32                     token;
+       int                     cpu;
+       bool                    halted;
+};
+
+struct kvm_task_sleep_head {
+       raw_spinlock_t          lock;
+       struct hlist_head       list;
+};
+#endif /* CONFIG_KVM_GUEST */
+
 static const struct fault_info fault_info[];
 static struct fault_info debug_fault_info[];
+#ifdef CONFIG_KVM_GUEST
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_data) __aligned(64);
+static struct kvm_task_sleep_head async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+static bool async_pf_initialized;
+#endif
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
@@ -623,6 +648,178 @@ static int do_alignment_fault(unsigned long addr, 
unsigned int esr,
        return 0;
 }
 
+#ifdef CONFIG_KVM_GUEST
+static struct kvm_task_sleep_node *kvm_pv_async_pf_find(
+               struct kvm_task_sleep_head *b, u32 token)
+{
+       struct kvm_task_sleep_node *n;
+       struct hlist_node *p;
+
+       hlist_for_each(p, &b->list) {
+               n = hlist_entry(p, typeof(*n), link);
+               if (n->token == token)
+                       return n;
+       }
+
+       return NULL;
+}
+
+static void kvm_pv_async_pf_wait(u32 token, int interrupt_kernel)
+{
+       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+       struct kvm_task_sleep_node n, *e;
+       DECLARE_SWAITQUEUE(wait);
+
+       raw_spin_lock(&b->lock);
+       e = kvm_pv_async_pf_find(b, token);
+       if (e) {
+               /* dummy entry exist -> wake up was delivered ahead of PF */
+               hlist_del(&e->link);
+               kfree(e);
+               raw_spin_unlock(&b->lock);
+
+               return;
+       }
+
+       n.token = token;
+       n.cpu = smp_processor_id();
+       n.halted = is_idle_task(current) ||
+                  (IS_ENABLED(CONFIG_PREEMPT_COUNT) ?
+                       preempt_count() > 1 || rcu_preempt_depth() :
+                       interrupt_kernel);
+       init_swait_queue_head(&n.wq);
+       hlist_add_head(&n.link, &b->list);
+       raw_spin_unlock(&b->lock);
+
+       for (;;) {
+               if (!n.halted) {
+                       prepare_to_swait_exclusive(&n.wq, &wait,
+                                                  TASK_UNINTERRUPTIBLE);
+               }
+
+               if (hlist_unhashed(&n.link))
+                       break;
+
+               /*
+                * Enable the IRQ explicitly. Otherwise, the task
+                * won't be scheduled or waken up properly.
+                */
+               local_irq_enable();
+
+               if (!n.halted) {
+                       schedule();
+               } else {
+                       dsb(sy);
+                       wfi();
+               }
+
+               local_irq_disable();
+       }
+
+       if (!n.halted)
+               finish_swait(&n.wq, &wait);
+}
+
+static inline void kvm_pv_async_pf_wake_one(struct kvm_task_sleep_node *n)
+{
+       /* The task will be waken up once being detached */
+       hlist_del_init(&n->link);
+
+       if (!n->halted)
+               swake_up_one(&n->wq);
+       else
+               smp_send_reschedule(n->cpu);
+}
+
+static void kvm_pv_async_pf_wake_all(void)
+{
+       struct kvm_task_sleep_head *b;
+       struct kvm_task_sleep_node *n;
+       struct hlist_node *p, *next;
+       int i;
+
+       for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+               b = &async_pf_sleepers[i];
+
+               raw_spin_lock(&b->lock);
+
+               hlist_for_each_safe(p, next, &b->list) {
+                       n = hlist_entry(p, typeof(*n), link);
+                       if (n->cpu != smp_processor_id())
+                               continue;
+
+                       kvm_pv_async_pf_wake_one(n);
+               }
+
+               raw_spin_unlock(&b->lock);
+       }
+}
+
+static void kvm_pv_async_pf_wake(u32 token)
+{
+       u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+       struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+       struct kvm_task_sleep_node *n;
+
+       if (token == ~0) {
+               kvm_pv_async_pf_wake_all();
+               return;
+       }
+
+again:
+       raw_spin_lock(&b->lock);
+
+       n = kvm_pv_async_pf_find(b, token);
+       if (!n) {
+               /*
+                * Async PF was not yet handled. Add dummy entry
+                * for the token. Busy wait until other CPU handles
+                * the async PF on allocation failure.
+                */
+               n = kzalloc(sizeof(*n), GFP_ATOMIC);
+               if (!n) {
+                       raw_spin_unlock(&b->lock);
+                       cpu_relax();
+                       goto again;
+               }
+               n->token = token;
+               n->cpu = smp_processor_id();
+               init_swait_queue_head(&n->wq);
+               hlist_add_head(&n->link, &b->list);
+       } else {
+               kvm_pv_async_pf_wake_one(n);
+       }
+
+       raw_spin_unlock(&b->lock);
+}
+#endif /* CONFIG_KVM_GUEST */
+
+static int do_lockdown(unsigned long addr, unsigned int esr,
+                      struct pt_regs *regs)
+{
+#ifdef CONFIG_KVM_GUEST
+       u32 reason = 0;
+
+       if (__this_cpu_read(apf_data.enabled)) {
+               reason = __this_cpu_read(apf_data.reason);
+               __this_cpu_write(apf_data.reason, 0);
+       }
+
+       switch (reason) {
+       case KVM_PV_REASON_PAGE_NOT_PRESENT:
+               kvm_pv_async_pf_wait((u32)addr, !user_mode(regs));
+               return 0;
+       case KVM_PV_REASON_PAGE_READY:
+               kvm_pv_async_pf_wake((u32)addr);
+               return 0;
+       }
+#endif /* CONFIG_KVM_GUEST */
+
+       pr_info("%s: addr=0x%lx, esr=0x%x\n", __func__, addr, esr);
+       return 1;
+}
+
 static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
        return 1; /* "fault" */
@@ -703,7 +900,8 @@ static const struct fault_info fault_info[] = {
        { do_bad,               SIGKILL, SI_KERNEL,     "Unsupported atomic 
hardware update fault"      },
        { do_bad,               SIGKILL, SI_KERNEL,     "unknown 50"            
        },
        { do_bad,               SIGKILL, SI_KERNEL,     "unknown 51"            
        },
-       { do_bad,               SIGKILL, SI_KERNEL,     "implementation fault 
(lockdown abort)" },
+       { do_lockdown,          SIGKILL, SI_KERNEL,
+         "implementation fault (lockdown abort)" },
        { do_bad,               SIGBUS,  BUS_OBJERR,    "implementation fault 
(unsupported exclusive)" },
        { do_bad,               SIGKILL, SI_KERNEL,     "unknown 54"            
        },
        { do_bad,               SIGKILL, SI_KERNEL,     "unknown 55"            
        },
@@ -878,3 +1076,42 @@ void do_debug_exception(unsigned long addr_if_watchpoint, 
unsigned int esr,
        debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
+
+#ifdef CONFIG_KVM_GUEST
+void kvm_pv_async_pf_enable(void)
+{
+       u64 pa;
+       int i;
+
+       if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+            __this_cpu_read(apf_data.enabled))
+               return;
+
+       if (!async_pf_initialized) {
+               async_pf_initialized = true;
+               for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+                       raw_spin_lock_init(&async_pf_sleepers[i].lock);
+       }
+
+       /* FIXME: Enable KVM_ASYNC_PF_SEND_ALWAYS on CONFIG_PREEMPTION */
+       pa = virt_to_phys(this_cpu_ptr(&apf_data));
+       pa |= KVM_ASYNC_PF_ENABLED;
+
+       __this_cpu_write(apf_data.enabled, 1);
+       write_sysreg_s(pa, SYS_ASYNC_PF_EL1);
+
+       pr_info("Async PF enabled on CPU %d\n", smp_processor_id());
+}
+
+void kvm_pv_async_pf_disable(void)
+{
+       if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) ||
+           !__this_cpu_read(apf_data.enabled))
+               return;
+
+       write_sysreg_s(0, SYS_ASYNC_PF_EL1);
+       __this_cpu_write(apf_data.enabled, 0);
+
+       pr_info("Async PF disabled on CPU %d\n", smp_processor_id());
+}
+#endif /* CONFIG_KVM_GUEST */
-- 
2.23.0

_______________________________________________
kvmarm mailing list
[email protected]
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

Reply via email to