Module Name: src Committed By: riastradh Date: Tue Sep 13 20:10:04 UTC 2022
Modified Files: src/sys/dev/nvmm: nvmm.c nvmm_internal.h src/sys/dev/nvmm/x86: nvmm_x86_vmx.c Log Message: nvmm(4): Add suspend/resume support. New MD nvmm_impl callbacks: - .suspend_interrupt forces all VMs on all physical CPUs to exit. - .vcpu_suspend suspends an individual vCPU on a machine. - .machine_suspend suspends an individual machine. - .suspend suspends the whole system. - .resume resumes the whole system. - .machine_resume resumes an individual machine. - .vcpu_resume resumes an individual vCPU on a machine. Suspending nvmm: 1. causes new VM operations (ioctl and close) to block until resumed, 2. uses .suspend_interrupt to interrupt any concurrent operations and force them to return early, and then 3. uses the various suspend callbacks to suspend all vCPUs, machines, and the whole system -- all vCPUs before the machine they're on, and all machines before the system. Resuming nvmm does the reverse of (3) -- resume system, resume each machine and then the vCPUs on that machine -- and then unblocks operations. Implemented only for x86-vmx for now: - suspend_interrupt triggers a TLB IPI to cause VM exits; - vcpu_suspend issues VMCLEAR to force any in-CPU state to be written to memory; - machine_suspend does nothing; - suspend does VMXOFF on all CPUs; - resume does VMXON on all CPUs; - machine_resume does nothing; and - vcpu_resume just marks each vCPU as valid but inactive so subsequent use will clear it and load it with vmptrld. x86-svm left as an exercise for the reader. To generate a diff of this commit: cvs rdiff -u -r1.46 -r1.47 src/sys/dev/nvmm/nvmm.c cvs rdiff -u -r1.20 -r1.21 src/sys/dev/nvmm/nvmm_internal.h cvs rdiff -u -r1.84 -r1.85 src/sys/dev/nvmm/x86/nvmm_x86_vmx.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/dev/nvmm/nvmm.c diff -u src/sys/dev/nvmm/nvmm.c:1.46 src/sys/dev/nvmm/nvmm.c:1.47 --- src/sys/dev/nvmm/nvmm.c:1.46 Thu Jul 7 23:50:33 2022 +++ src/sys/dev/nvmm/nvmm.c Tue Sep 13 20:10:04 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $ */ +/* $NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $ */ /* * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -59,6 +59,15 @@ __KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.4 static struct nvmm_machine machines[NVMM_MAX_MACHINES]; static volatile unsigned int nmachines __cacheline_aligned; +static struct { + kmutex_t lock; + kcondvar_t suspendcv; + kcondvar_t resumecv; + unsigned users; +} suspension; + +volatile bool nvmm_suspending; + static const struct nvmm_impl *nvmm_impl_list[] = { #if defined(__x86_64__) &nvmm_x86_svm, /* x86 AMD SVM */ @@ -73,6 +82,50 @@ static struct nvmm_owner root_owner; /* -------------------------------------------------------------------------- */ static int +nvmm_enter_sig(void) +{ + int error; + + mutex_enter(&suspension.lock); + while (nvmm_suspending) { + error = cv_wait_sig(&suspension.resumecv, &suspension.lock); + if (error) + goto out; + } + KASSERT(suspension.users < UINT_MAX); + suspension.users++; + error = 0; +out: mutex_exit(&suspension.lock); + + return 0; +} + +static void +nvmm_enter(void) +{ + + mutex_enter(&suspension.lock); + while (nvmm_suspending) + cv_wait(&suspension.resumecv, &suspension.lock); + KASSERT(suspension.users < UINT_MAX); + suspension.users++; + mutex_exit(&suspension.lock); +} + +static void +nvmm_exit(void) +{ + + mutex_enter(&suspension.lock); + KASSERT(suspension.users > 0); + if (--suspension.users == 0) + cv_signal(&suspension.suspendcv); 
+ mutex_exit(&suspension.lock); +} + +/* -------------------------------------------------------------------------- */ + +static int nvmm_machine_alloc(struct nvmm_machine **ret) { struct nvmm_machine *mach; @@ -989,6 +1042,11 @@ nvmm_init(void) } } + mutex_init(&suspension.lock, MUTEX_DEFAULT, IPL_NONE); + cv_init(&suspension.suspendcv, "nvmmsus"); + cv_init(&suspension.resumecv, "nvmmres"); + suspension.users = 0; + (*nvmm_impl->init)(); return 0; @@ -1080,7 +1138,11 @@ nvmm_close(file_t *fp) struct nvmm_owner *owner = fp->f_data; KASSERT(owner != NULL); + + nvmm_enter(); nvmm_kill_machines(owner); + nvmm_exit(); + if (owner != &root_owner) { kmem_free(owner, sizeof(*owner)); } @@ -1126,7 +1188,7 @@ nvmm_mmap(file_t *fp, off_t *offp, size_ } static int -nvmm_ioctl(file_t *fp, u_long cmd, void *data) +nvmm_ioctl_internal(file_t *fp, u_long cmd, void *data) { struct nvmm_owner *owner = fp->f_data; @@ -1170,11 +1232,27 @@ nvmm_ioctl(file_t *fp, u_long cmd, void } } +static int +nvmm_ioctl(struct file *fp, u_long cmd, void *data) +{ + int error; + + error = nvmm_enter_sig(); + if (error) + return error; + error = nvmm_ioctl_internal(fp, cmd, data); + nvmm_exit(); + + return error; +} + /* -------------------------------------------------------------------------- */ static int nvmm_match(device_t, cfdata_t, void *); static void nvmm_attach(device_t, device_t, void *); static int nvmm_detach(device_t, int); +static bool nvmm_suspend(device_t, const pmf_qual_t *); +static bool nvmm_resume(device_t, const pmf_qual_t *); extern struct cfdriver nvmm_cd; @@ -1209,6 +1287,8 @@ nvmm_attach(device_t parent, device_t se panic("%s: impossible", __func__); aprint_normal_dev(self, "attached, using backend %s\n", nvmm_impl->name); + if (nvmm_impl->suspend != NULL && nvmm_impl->resume != NULL) + pmf_device_register(self, nvmm_suspend, nvmm_resume); } static int @@ -1216,10 +1296,147 @@ nvmm_detach(device_t self, int flags) { if (atomic_load_relaxed(&nmachines) > 0) return EBUSY; + 
pmf_device_deregister(self); nvmm_fini(); return 0; } +static void +nvmm_suspend_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + + mutex_enter(&vcpu->lock); + if (vcpu->present && nvmm_impl->vcpu_suspend) + (*nvmm_impl->vcpu_suspend)(mach, vcpu); + mutex_exit(&vcpu->lock); +} + +static void +nvmm_resume_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + + mutex_enter(&vcpu->lock); + if (vcpu->present && nvmm_impl->vcpu_resume) + (*nvmm_impl->vcpu_resume)(mach, vcpu); + mutex_exit(&vcpu->lock); +} + +static void +nvmm_suspend_machine(struct nvmm_machine *mach) +{ + + rw_enter(&mach->lock, RW_WRITER); + if (mach->present) { + if (nvmm_impl->vcpu_suspend) { + size_t cpuid; + + for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++) + nvmm_suspend_vcpu(mach, &mach->cpus[cpuid]); + } + if (nvmm_impl->machine_suspend) + (*nvmm_impl->machine_suspend)(mach); + } + rw_exit(&mach->lock); +} + +static void +nvmm_resume_machine(struct nvmm_machine *mach) +{ + + rw_enter(&mach->lock, RW_WRITER); + if (mach->present) { + if (nvmm_impl->vcpu_resume) { + size_t cpuid; + + for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++) + nvmm_resume_vcpu(mach, &mach->cpus[cpuid]); + } + if (nvmm_impl->machine_resume) + (*nvmm_impl->machine_resume)(mach); + } + rw_exit(&mach->lock); +} + +static bool +nvmm_suspend(device_t self, const pmf_qual_t *qual) +{ + size_t i; + + /* + * Prevent new users (via ioctl) from starting. + */ + mutex_enter(&suspension.lock); + KASSERT(!nvmm_suspending); + atomic_store_relaxed(&nvmm_suspending, true); + mutex_exit(&suspension.lock); + + /* + * Interrupt any running VMs so they will break out of run + * loops or anything else and not start up again until we've + * resumed. + */ + if (nvmm_impl->suspend_interrupt) + (*nvmm_impl->suspend_interrupt)(); + + /* + * Wait for any running VMs or other ioctls to finish running + * or handling any other ioctls. 
+ */ + mutex_enter(&suspension.lock); + while (suspension.users) + cv_wait(&suspension.suspendcv, &suspension.lock); + mutex_exit(&suspension.lock); + + /* + * Suspend all the machines. + */ + if (nvmm_impl->machine_suspend || nvmm_impl->vcpu_suspend) { + for (i = 0; i < NVMM_MAX_MACHINES; i++) + nvmm_suspend_machine(&machines[i]); + } + + /* + * Take any systemwide suspend action. + */ + if (nvmm_impl->suspend) + (*nvmm_impl->suspend)(); + + return true; +} + +static bool +nvmm_resume(device_t self, const pmf_qual_t *qual) +{ + size_t i; + + KASSERT(atomic_load_relaxed(&nvmm_suspending)); + KASSERT(suspension.users == 0); + + /* + * Take any systemwide resume action. + */ + if (nvmm_impl->resume) + (*nvmm_impl->resume)(); + + /* + * Resume all the machines. + */ + if (nvmm_impl->machine_resume || nvmm_impl->vcpu_resume) { + for (i = 0; i < NVMM_MAX_MACHINES; i++) + nvmm_resume_machine(&machines[i]); + } + + /* + * Allow new users (via ioctl) to start again. + */ + mutex_enter(&suspension.lock); + atomic_store_relaxed(&nvmm_suspending, false); + cv_broadcast(&suspension.resumecv); + mutex_exit(&suspension.lock); + + return true; +} + void nvmmattach(int nunits) { Index: src/sys/dev/nvmm/nvmm_internal.h diff -u src/sys/dev/nvmm/nvmm_internal.h:1.20 src/sys/dev/nvmm/nvmm_internal.h:1.21 --- src/sys/dev/nvmm/nvmm_internal.h:1.20 Fri Mar 26 15:59:53 2021 +++ src/sys/dev/nvmm/nvmm_internal.h Tue Sep 13 20:10:04 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm_internal.h,v 1.20 2021/03/26 15:59:53 reinoud Exp $ */ +/* $NetBSD: nvmm_internal.h,v 1.21 2022/09/13 20:10:04 riastradh Exp $ */ /* * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net @@ -107,6 +107,9 @@ struct nvmm_impl { void (*init)(void); void (*fini)(void); void (*capability)(struct nvmm_capability *); + void (*suspend_interrupt)(void); + void (*suspend)(void); + void (*resume)(void); size_t mach_conf_max; const size_t *mach_conf_sizes; @@ -119,6 +122,8 @@ struct nvmm_impl { void (*machine_create)(struct nvmm_machine 
*); void (*machine_destroy)(struct nvmm_machine *); int (*machine_configure)(struct nvmm_machine *, uint64_t, void *); + void (*machine_suspend)(struct nvmm_machine *); + void (*machine_resume)(struct nvmm_machine *); int (*vcpu_create)(struct nvmm_machine *, struct nvmm_cpu *); void (*vcpu_destroy)(struct nvmm_machine *, struct nvmm_cpu *); @@ -128,6 +133,8 @@ struct nvmm_impl { int (*vcpu_inject)(struct nvmm_cpu *); int (*vcpu_run)(struct nvmm_machine *, struct nvmm_cpu *, struct nvmm_vcpu_exit *); + void (*vcpu_suspend)(struct nvmm_machine *, struct nvmm_cpu *); + void (*vcpu_resume)(struct nvmm_machine *, struct nvmm_cpu *); }; #if defined(__x86_64__) @@ -135,6 +142,8 @@ extern const struct nvmm_impl nvmm_x86_s extern const struct nvmm_impl nvmm_x86_vmx; #endif +extern volatile bool nvmm_suspending; + static inline bool nvmm_return_needed(struct nvmm_cpu *vcpu, struct nvmm_vcpu_exit *exit) { @@ -151,6 +160,10 @@ nvmm_return_needed(struct nvmm_cpu *vcpu exit->reason = NVMM_VCPU_EXIT_STOPPED; return true; } + if (atomic_load_relaxed(&nvmm_suspending)) { + exit->reason = NVMM_VCPU_EXIT_NONE; + return true; + } return false; } Index: src/sys/dev/nvmm/x86/nvmm_x86_vmx.c diff -u src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.84 src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.85 --- src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.84 Sat Aug 20 23:48:51 2022 +++ src/sys/dev/nvmm/x86/nvmm_x86_vmx.c Tue Sep 13 20:10:04 2022 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm_x86_vmx.c,v 1.84 2022/08/20 23:48:51 riastradh Exp $ */ +/* $NetBSD: nvmm_x86_vmx.c,v 1.85 2022/09/13 20:10:04 riastradh Exp $ */ /* * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net @@ -29,7 +29,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.84 2022/08/20 23:48:51 riastradh Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.85 2022/09/13 20:10:04 riastradh Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -3135,6 +3135,41 @@ vmx_vcpu_configure(struct nvmm_cpu *vcpu } } +static void 
+vmx_vcpu_suspend(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + struct cpu_info *vmcs_ci; + + KASSERT(cpudata->vmcs_refcnt == 0); + + vmcs_ci = cpudata->vmcs_ci; + cpudata->vmcs_ci = (void *)0x00FFFFFFFFFFFFFF; /* clobber */ + + kpreempt_disable(); + if (vmcs_ci == NULL) { + /* VMCS is inactive, nothing to do. */ + } else if (vmcs_ci != curcpu()) { + /* VMCS is active on a remote CPU; clear it there. */ + vmx_vmclear_remote(vmcs_ci, cpudata->vmcs_pa); + } else { + /* VMCS is active on this CPU; clear it here. */ + vmx_vmclear(&cpudata->vmcs_pa); + } + kpreempt_enable(); +} + +static void +vmx_vcpu_resume(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + KASSERT(cpudata->vmcs_refcnt == 0); + + /* Mark VMCS as inactive. */ + cpudata->vmcs_ci = NULL; +} + /* -------------------------------------------------------------------------- */ static void @@ -3463,11 +3498,41 @@ vmx_init_l1tf(void) } static void +vmx_suspend_interrupt(void) +{ + + /* + * Generates IPIs, which cause #VMEXITs. No other purpose for + * the TLB business; the #VMEXIT triggered by IPI is the only + * effect that matters here. 
+ */ + pmap_tlb_shootdown(pmap_kernel(), -1, PTE_G, TLBSHOOT_NVMM); +} + +static void +vmx_suspend(void) +{ + uint64_t xc; + + xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL); + xc_wait(xc); +} + +static void +vmx_resume(void) +{ + uint64_t xc; + + xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL); + xc_wait(xc); +} + +static void vmx_init(void) { CPU_INFO_ITERATOR cii; struct cpu_info *ci; - uint64_t xc, msr; + uint64_t msr; struct vmxon *vmxon; uint32_t revision; u_int descs[4]; @@ -3524,8 +3589,7 @@ vmx_init(void) vmxon->ident = __SHIFTIN(revision, VMXON_IDENT_REVISION); } - xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL); - xc_wait(xc); + vmx_resume(); } static void @@ -3542,11 +3606,9 @@ vmx_fini_asid(void) static void vmx_fini(void) { - uint64_t xc; size_t i; - xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL); - xc_wait(xc); + vmx_suspend(); for (i = 0; i < MAXCPUS; i++) { if (vmxoncpu[i].pa != 0) @@ -3573,6 +3635,9 @@ const struct nvmm_impl nvmm_x86_vmx = { .ident = vmx_ident, .init = vmx_init, .fini = vmx_fini, + .suspend_interrupt = vmx_suspend_interrupt, + .suspend = vmx_suspend, + .resume = vmx_resume, .capability = vmx_capability, .mach_conf_max = NVMM_X86_MACH_NCONF, .mach_conf_sizes = NULL, @@ -3588,5 +3653,7 @@ const struct nvmm_impl nvmm_x86_vmx = { .vcpu_setstate = vmx_vcpu_setstate, .vcpu_getstate = vmx_vcpu_getstate, .vcpu_inject = vmx_vcpu_inject, - .vcpu_run = vmx_vcpu_run + .vcpu_run = vmx_vcpu_run, + .vcpu_suspend = vmx_vcpu_suspend, + .vcpu_resume = vmx_vcpu_resume, };