Module Name:    src
Committed By:   riastradh
Date:           Tue Sep 13 20:10:04 UTC 2022

Modified Files:
        src/sys/dev/nvmm: nvmm.c nvmm_internal.h
        src/sys/dev/nvmm/x86: nvmm_x86_vmx.c

Log Message:
nvmm(4): Add suspend/resume support.

New MD nvmm_impl callbacks:

- .suspend_interrupt forces all VMs on all physical CPUs to exit.
- .vcpu_suspend suspends an individual vCPU on a machine.
- .machine_suspend suspends an individual machine.
- .suspend suspends the whole system.
- .resume resumes the whole system.
- .machine_resume resumes an individual machine.
- .vcpu_resume resumes an individual vCPU on a machine.

Suspending nvmm:

1. causes new VM operations (ioctl and close) to block until resumed,
2. uses .suspend_interrupt to interrupt any concurrent VM operations and
   force them to return early, and then
3. uses the various suspend callbacks to suspend all vCPUs, machines,
   and the whole system -- all vCPUs before the machine they're on,
   and all machines before the system.

Resuming nvmm does the reverse of (3) -- resume system, resume each
machine and then the vCPUs on that machine -- and then unblocks
operations.

Implemented only for x86-vmx for now:

- suspend_interrupt triggers a TLB IPI to cause VM exits;
- vcpu_suspend issues VMCLEAR to force any in-CPU state to be written
  to memory;
- machine_suspend does nothing;
- suspend does VMXOFF on all CPUs;
- resume does VMXON on all CPUs;
- machine_resume does nothing; and
- vcpu_resume just marks each vCPU as valid but inactive so
  subsequent use will clear it and load it with vmptrld.

x86-svm left as an exercise for the reader.


To generate a diff of this commit:
cvs rdiff -u -r1.46 -r1.47 src/sys/dev/nvmm/nvmm.c
cvs rdiff -u -r1.20 -r1.21 src/sys/dev/nvmm/nvmm_internal.h
cvs rdiff -u -r1.84 -r1.85 src/sys/dev/nvmm/x86/nvmm_x86_vmx.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/dev/nvmm/nvmm.c
diff -u src/sys/dev/nvmm/nvmm.c:1.46 src/sys/dev/nvmm/nvmm.c:1.47
--- src/sys/dev/nvmm/nvmm.c:1.46	Thu Jul  7 23:50:33 2022
+++ src/sys/dev/nvmm/nvmm.c	Tue Sep 13 20:10:04 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $	*/
+/*	$NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $	*/
 
 /*
  * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.46 2022/07/07 23:50:33 pgoyette Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.47 2022/09/13 20:10:04 riastradh Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -59,6 +59,15 @@ __KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.4
 static struct nvmm_machine machines[NVMM_MAX_MACHINES];
 static volatile unsigned int nmachines __cacheline_aligned;
 
+static struct {
+	kmutex_t	lock;
+	kcondvar_t	suspendcv;
+	kcondvar_t	resumecv;
+	unsigned	users;
+} suspension;
+
+volatile bool nvmm_suspending;
+
 static const struct nvmm_impl *nvmm_impl_list[] = {
 #if defined(__x86_64__)
 	&nvmm_x86_svm,	/* x86 AMD SVM */
@@ -73,6 +82,50 @@ static struct nvmm_owner root_owner;
 /* -------------------------------------------------------------------------- */
 
 static int
+nvmm_enter_sig(void)
+{
+	int error;
+
+	/*
+	 * Interruptibly wait until nvmm is not suspending, then count
+	 * ourselves as an active user so nvmm_suspend() will wait for
+	 * us to finish.  Caller must pair success with nvmm_exit().
+	 */
+	mutex_enter(&suspension.lock);
+	while (nvmm_suspending) {
+		error = cv_wait_sig(&suspension.resumecv, &suspension.lock);
+		if (error)
+			goto out;
+	}
+	KASSERT(suspension.users < UINT_MAX);
+	suspension.users++;
+	error = 0;
+out:	mutex_exit(&suspension.lock);
+
+	/*
+	 * Return the error (not 0): if cv_wait_sig failed we did NOT
+	 * increment suspension.users, and returning success here would
+	 * make the caller run the ioctl anyway and then call
+	 * nvmm_exit(), underflowing the user count.
+	 */
+	return error;
+}
+
+static void
+nvmm_enter(void)
+{
+
+	mutex_enter(&suspension.lock);
+	while (nvmm_suspending)
+		cv_wait(&suspension.resumecv, &suspension.lock);
+	KASSERT(suspension.users < UINT_MAX);
+	suspension.users++;
+	mutex_exit(&suspension.lock);
+}
+
+static void
+nvmm_exit(void)
+{
+
+	mutex_enter(&suspension.lock);
+	KASSERT(suspension.users > 0);
+	if (--suspension.users == 0)
+		cv_signal(&suspension.suspendcv);
+	mutex_exit(&suspension.lock);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
 nvmm_machine_alloc(struct nvmm_machine **ret)
 {
 	struct nvmm_machine *mach;
@@ -989,6 +1042,11 @@ nvmm_init(void)
 		}
 	}
 
+	mutex_init(&suspension.lock, MUTEX_DEFAULT, IPL_NONE);
+	cv_init(&suspension.suspendcv, "nvmmsus");
+	cv_init(&suspension.resumecv, "nvmmres");
+	suspension.users = 0;
+
 	(*nvmm_impl->init)();
 
 	return 0;
@@ -1080,7 +1138,11 @@ nvmm_close(file_t *fp)
 	struct nvmm_owner *owner = fp->f_data;
 
 	KASSERT(owner != NULL);
+
+	nvmm_enter();
 	nvmm_kill_machines(owner);
+	nvmm_exit();
+
 	if (owner != &root_owner) {
 		kmem_free(owner, sizeof(*owner));
 	}
@@ -1126,7 +1188,7 @@ nvmm_mmap(file_t *fp, off_t *offp, size_
 }
 
 static int
-nvmm_ioctl(file_t *fp, u_long cmd, void *data)
+nvmm_ioctl_internal(file_t *fp, u_long cmd, void *data)
 {
 	struct nvmm_owner *owner = fp->f_data;
 
@@ -1170,11 +1232,27 @@ nvmm_ioctl(file_t *fp, u_long cmd, void 
 	}
 }
 
+static int
+nvmm_ioctl(struct file *fp, u_long cmd, void *data)
+{
+	int error;
+
+	error = nvmm_enter_sig();
+	if (error)
+		return error;
+	error = nvmm_ioctl_internal(fp, cmd, data);
+	nvmm_exit();
+
+	return error;
+}
+
 /* -------------------------------------------------------------------------- */
 
 static int nvmm_match(device_t, cfdata_t, void *);
 static void nvmm_attach(device_t, device_t, void *);
 static int nvmm_detach(device_t, int);
+static bool nvmm_suspend(device_t, const pmf_qual_t *);
+static bool nvmm_resume(device_t, const pmf_qual_t *);
 
 extern struct cfdriver nvmm_cd;
 
@@ -1209,6 +1287,8 @@ nvmm_attach(device_t parent, device_t se
 		panic("%s: impossible", __func__);
 	aprint_normal_dev(self, "attached, using backend %s\n",
 	    nvmm_impl->name);
+	if (nvmm_impl->suspend != NULL && nvmm_impl->resume != NULL)
+		pmf_device_register(self, nvmm_suspend, nvmm_resume);
 }
 
 static int
@@ -1216,10 +1296,147 @@ nvmm_detach(device_t self, int flags)
 {
 	if (atomic_load_relaxed(&nmachines) > 0)
 		return EBUSY;
+	pmf_device_deregister(self);
 	nvmm_fini();
 	return 0;
 }
 
+static void
+nvmm_suspend_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+
+	mutex_enter(&vcpu->lock);
+	if (vcpu->present && nvmm_impl->vcpu_suspend)
+		(*nvmm_impl->vcpu_suspend)(mach, vcpu);
+	mutex_exit(&vcpu->lock);
+}
+
+static void
+nvmm_resume_vcpu(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+
+	mutex_enter(&vcpu->lock);
+	if (vcpu->present && nvmm_impl->vcpu_resume)
+		(*nvmm_impl->vcpu_resume)(mach, vcpu);
+	mutex_exit(&vcpu->lock);
+}
+
+static void
+nvmm_suspend_machine(struct nvmm_machine *mach)
+{
+
+	rw_enter(&mach->lock, RW_WRITER);
+	if (mach->present) {
+		if (nvmm_impl->vcpu_suspend) {
+			size_t cpuid;
+
+			for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++)
+				nvmm_suspend_vcpu(mach, &mach->cpus[cpuid]);
+		}
+		if (nvmm_impl->machine_suspend)
+			(*nvmm_impl->machine_suspend)(mach);
+	}
+	rw_exit(&mach->lock);
+}
+
+static void
+nvmm_resume_machine(struct nvmm_machine *mach)
+{
+
+	rw_enter(&mach->lock, RW_WRITER);
+	if (mach->present) {
+		if (nvmm_impl->vcpu_resume) {
+			size_t cpuid;
+
+			for (cpuid = 0; cpuid < NVMM_MAX_VCPUS; cpuid++)
+				nvmm_resume_vcpu(mach, &mach->cpus[cpuid]);
+		}
+		if (nvmm_impl->machine_resume)
+			(*nvmm_impl->machine_resume)(mach);
+	}
+	rw_exit(&mach->lock);
+}
+
+static bool
+nvmm_suspend(device_t self, const pmf_qual_t *qual)
+{
+	size_t i;
+
+	/*
+	 * Prevent new users (via ioctl) from starting.
+	 */
+	mutex_enter(&suspension.lock);
+	KASSERT(!nvmm_suspending);
+	atomic_store_relaxed(&nvmm_suspending, true);
+	mutex_exit(&suspension.lock);
+
+	/*
+	 * Interrupt any running VMs so they will break out of run
+	 * loops or anything else and not start up again until we've
+	 * resumed.
+	 */
+	if (nvmm_impl->suspend_interrupt)
+		(*nvmm_impl->suspend_interrupt)();
+
+	/*
+	 * Wait for any running VMs or other ioctls to finish running
+	 * or handling any other ioctls.
+	 */
+	mutex_enter(&suspension.lock);
+	while (suspension.users)
+		cv_wait(&suspension.suspendcv, &suspension.lock);
+	mutex_exit(&suspension.lock);
+
+	/*
+	 * Suspend all the machines.
+	 */
+	if (nvmm_impl->machine_suspend || nvmm_impl->vcpu_suspend) {
+		for (i = 0; i < NVMM_MAX_MACHINES; i++)
+			nvmm_suspend_machine(&machines[i]);
+	}
+
+	/*
+	 * Take any systemwide suspend action.
+	 */
+	if (nvmm_impl->suspend)
+		(*nvmm_impl->suspend)();
+
+	return true;
+}
+
+static bool
+nvmm_resume(device_t self, const pmf_qual_t *qual)
+{
+	size_t i;
+
+	KASSERT(atomic_load_relaxed(&nvmm_suspending));
+	KASSERT(suspension.users == 0);
+
+	/*
+	 * Take any systemwide resume action.
+	 */
+	if (nvmm_impl->resume)
+		(*nvmm_impl->resume)();
+
+	/*
+	 * Resume all the machines.
+	 */
+	if (nvmm_impl->machine_resume || nvmm_impl->vcpu_resume) {
+		for (i = 0; i < NVMM_MAX_MACHINES; i++)
+			nvmm_resume_machine(&machines[i]);
+	}
+
+	/*
+	 * Allow new users (via ioctl) to start again.
+	 */
+	mutex_enter(&suspension.lock);
+	atomic_store_relaxed(&nvmm_suspending, false);
+	cv_broadcast(&suspension.resumecv);
+	mutex_exit(&suspension.lock);
+
+	return true;
+}
+
 void
 nvmmattach(int nunits)
 {

Index: src/sys/dev/nvmm/nvmm_internal.h
diff -u src/sys/dev/nvmm/nvmm_internal.h:1.20 src/sys/dev/nvmm/nvmm_internal.h:1.21
--- src/sys/dev/nvmm/nvmm_internal.h:1.20	Fri Mar 26 15:59:53 2021
+++ src/sys/dev/nvmm/nvmm_internal.h	Tue Sep 13 20:10:04 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm_internal.h,v 1.20 2021/03/26 15:59:53 reinoud Exp $	*/
+/*	$NetBSD: nvmm_internal.h,v 1.21 2022/09/13 20:10:04 riastradh Exp $	*/
 
 /*
  * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
@@ -107,6 +107,9 @@ struct nvmm_impl {
 	void (*init)(void);
 	void (*fini)(void);
 	void (*capability)(struct nvmm_capability *);
+	void (*suspend_interrupt)(void);
+	void (*suspend)(void);
+	void (*resume)(void);
 
 	size_t mach_conf_max;
 	const size_t *mach_conf_sizes;
@@ -119,6 +122,8 @@ struct nvmm_impl {
 	void (*machine_create)(struct nvmm_machine *);
 	void (*machine_destroy)(struct nvmm_machine *);
 	int (*machine_configure)(struct nvmm_machine *, uint64_t, void *);
+	void (*machine_suspend)(struct nvmm_machine *);
+	void (*machine_resume)(struct nvmm_machine *);
 
 	int (*vcpu_create)(struct nvmm_machine *, struct nvmm_cpu *);
 	void (*vcpu_destroy)(struct nvmm_machine *, struct nvmm_cpu *);
@@ -128,6 +133,8 @@ struct nvmm_impl {
 	int (*vcpu_inject)(struct nvmm_cpu *);
 	int (*vcpu_run)(struct nvmm_machine *, struct nvmm_cpu *,
 	    struct nvmm_vcpu_exit *);
+	void (*vcpu_suspend)(struct nvmm_machine *, struct nvmm_cpu *);
+	void (*vcpu_resume)(struct nvmm_machine *, struct nvmm_cpu *);
 };
 
 #if defined(__x86_64__)
@@ -135,6 +142,8 @@ extern const struct nvmm_impl nvmm_x86_s
 extern const struct nvmm_impl nvmm_x86_vmx;
 #endif
 
+extern volatile bool nvmm_suspending;
+
 static inline bool
 nvmm_return_needed(struct nvmm_cpu *vcpu, struct nvmm_vcpu_exit *exit)
 {
@@ -151,6 +160,10 @@ nvmm_return_needed(struct nvmm_cpu *vcpu
 		exit->reason = NVMM_VCPU_EXIT_STOPPED;
 		return true;
 	}
+	if (atomic_load_relaxed(&nvmm_suspending)) {
+		exit->reason = NVMM_VCPU_EXIT_NONE;
+		return true;
+	}
 
 	return false;
 }

Index: src/sys/dev/nvmm/x86/nvmm_x86_vmx.c
diff -u src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.84 src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.85
--- src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.84	Sat Aug 20 23:48:51 2022
+++ src/sys/dev/nvmm/x86/nvmm_x86_vmx.c	Tue Sep 13 20:10:04 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm_x86_vmx.c,v 1.84 2022/08/20 23:48:51 riastradh Exp $	*/
+/*	$NetBSD: nvmm_x86_vmx.c,v 1.85 2022/09/13 20:10:04 riastradh Exp $	*/
 
 /*
  * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.84 2022/08/20 23:48:51 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.85 2022/09/13 20:10:04 riastradh Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -3135,6 +3135,41 @@ vmx_vcpu_configure(struct nvmm_cpu *vcpu
 	}
 }
 
+static void
+vmx_vcpu_suspend(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct vmx_cpudata *cpudata = vcpu->cpudata;
+	struct cpu_info *vmcs_ci;
+
+	KASSERT(cpudata->vmcs_refcnt == 0);
+
+	vmcs_ci = cpudata->vmcs_ci;
+	cpudata->vmcs_ci = (void *)0x00FFFFFFFFFFFFFF; /* clobber */
+
+	kpreempt_disable();
+	if (vmcs_ci == NULL) {
+		/* VMCS is inactive, nothing to do.  */
+	} else if (vmcs_ci != curcpu()) {
+		/* VMCS is active on a remote CPU; clear it there.  */
+		vmx_vmclear_remote(vmcs_ci, cpudata->vmcs_pa);
+	} else {
+		/* VMCS is active on this CPU; clear it here.  */
+		vmx_vmclear(&cpudata->vmcs_pa);
+	}
+	kpreempt_enable();
+}
+
+static void
+vmx_vcpu_resume(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
+{
+	struct vmx_cpudata *cpudata = vcpu->cpudata;
+
+	KASSERT(cpudata->vmcs_refcnt == 0);
+
+	/* Mark VMCS as inactive.  */
+	cpudata->vmcs_ci = NULL;
+}
+
 /* -------------------------------------------------------------------------- */
 
 static void
@@ -3463,11 +3498,41 @@ vmx_init_l1tf(void)
 }
 
 static void
+vmx_suspend_interrupt(void)
+{
+
+	/*
+	 * Generates IPIs, which cause #VMEXITs.  No other purpose for
+	 * the TLB business; the #VMEXIT triggered by IPI is the only
+	 * effect that matters here.
+	 */
+	pmap_tlb_shootdown(pmap_kernel(), -1, PTE_G, TLBSHOOT_NVMM);
+}
+
+static void
+vmx_suspend(void)
+{
+	uint64_t xc;
+
+	xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL);
+	xc_wait(xc);
+}
+
+static void
+vmx_resume(void)
+{
+	uint64_t xc;
+
+	xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL);
+	xc_wait(xc);
+}
+
+static void
 vmx_init(void)
 {
 	CPU_INFO_ITERATOR cii;
 	struct cpu_info *ci;
-	uint64_t xc, msr;
+	uint64_t msr;
 	struct vmxon *vmxon;
 	uint32_t revision;
 	u_int descs[4];
@@ -3524,8 +3589,7 @@ vmx_init(void)
 		vmxon->ident = __SHIFTIN(revision, VMXON_IDENT_REVISION);
 	}
 
-	xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL);
-	xc_wait(xc);
+	vmx_resume();
 }
 
 static void
@@ -3542,11 +3606,9 @@ vmx_fini_asid(void)
 static void
 vmx_fini(void)
 {
-	uint64_t xc;
 	size_t i;
 
-	xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL);
-	xc_wait(xc);
+	vmx_suspend();
 
 	for (i = 0; i < MAXCPUS; i++) {
 		if (vmxoncpu[i].pa != 0)
@@ -3573,6 +3635,9 @@ const struct nvmm_impl nvmm_x86_vmx = {
 	.ident = vmx_ident,
 	.init = vmx_init,
 	.fini = vmx_fini,
+	.suspend_interrupt = vmx_suspend_interrupt,
+	.suspend = vmx_suspend,
+	.resume = vmx_resume,
 	.capability = vmx_capability,
 	.mach_conf_max = NVMM_X86_MACH_NCONF,
 	.mach_conf_sizes = NULL,
@@ -3588,5 +3653,7 @@ const struct nvmm_impl nvmm_x86_vmx = {
 	.vcpu_setstate = vmx_vcpu_setstate,
 	.vcpu_getstate = vmx_vcpu_getstate,
 	.vcpu_inject = vmx_vcpu_inject,
-	.vcpu_run = vmx_vcpu_run
+	.vcpu_run = vmx_vcpu_run,
+	.vcpu_suspend = vmx_vcpu_suspend,
+	.vcpu_resume = vmx_vcpu_resume,
 };

Reply via email to