Ackerley Tng <[email protected]> writes:
> Tarun Sahu <[email protected]> writes:
>
>> Introduce core infrastructure to support VM preservation with LUO.
>>
>> First two changes are just refactoring, no functional change, third
>> change introduces a new member in struct kvm.
>> - Move ITOA_MAX_LEN to kvm_mm.h for reuse by upcoming kvm_luo code.
>> - Add a public kvm_create_vm_file() helper wrapping kvm_create_vm()
>> and anon_inode_getfile() to provide a unified VM file creation API.
>> - Track a weak reference to the backing file in struct kvm under
>> CONFIG_LIVEUPDATE_GUEST_MEMFD to enable reverse file resolution
>> without circular lifetime dependencies.
>>
>
> Given the above, I think this should be separate patches.
>
>> Signed-off-by: Tarun Sahu <[email protected]>
>> ---
>> include/linux/kvm_host.h | 14 +++++++
>> virt/kvm/kvm_main.c | 79 +++++++++++++++++++++++++++++-----------
>> virt/kvm/kvm_mm.h | 3 ++
>> 3 files changed, 75 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 4c14aee1fb06..9111a28637af 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -874,6 +874,18 @@ struct kvm {
>> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>> /* Protected by slots_lock (for writes) and RCU (for reads) */
>> struct xarray mem_attr_array;
>> +#endif
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> + /*
>> + * Weak reference to the VFS file backing this KVM instance. Stored
>> + * without incrementing the file refcount to prevent a circular lifetime
>> + * dependency (since file->private_data already pins this struct kvm).
>> + * Used exclusively to resolve the file pointer back from struct kvm.
>> + *
>> + * Written/cleared via rcu_assign_pointer() and read locklessly under
>> + * RCU (e.g. via get_file_active() to prevent ABA races).
>> + */
>> + struct file *vm_file;
>> #endif
>
> We didn't really talk about this during the calls, but it seems weird to
> preserve a vm_file with pretty much nothing other than the vm type. The
> entire VM is re-created, which means it could potentially be a
> completely different VM?
>
> In some sense it's more flexible since the guest_memfd can be restored
> with some completely different VM, but it seems like it could introduce
> other issues.
>
> I think other KVM folks would probably have more thoughts here.
>
>> char stats_id[KVM_STATS_NAME_SIZE];
>> };
>> @@ -1074,7 +1086,9 @@ void kvm_get_kvm(struct kvm *kvm);
>> bool kvm_get_kvm_safe(struct kvm *kvm);
>> void kvm_put_kvm(struct kvm *kvm);
>> bool file_is_kvm(struct file *file);
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname);
>> void kvm_put_kvm_no_destroy(struct kvm *kvm);
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm);
>>
>> static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int
>> as_id)
>> {
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 89489996fbc1..65f0c5fb353e 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -67,9 +67,6 @@
>> #include <linux/kvm_dirty_ring.h>
>>
>>
>> -/* Worst case buffer size needed for holding an integer. */
>> -#define ITOA_MAX_LEN 12
>> -
>> MODULE_AUTHOR("Qumranet");
>> MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
>> MODULE_LICENSE("GPL");
>> @@ -1349,6 +1346,19 @@ static int kvm_vm_release(struct inode *inode, struct
>> file *filp)
>> {
>> struct kvm *kvm = filp->private_data;
>>
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> + /*
>> + * Clear the weak reference of the vm file.
>> + * In case vm file is closed by userspace, but kvm still has
>> + * other users like vCPUs, clearing this pointer ensures
>> + * that we don't have a dangling pointer to a closed file.
>> + *
>> + * Cleared via rcu_assign_pointer() to ensure proper memory visibility
>> + * for concurrent lockless readers under RCU.
>> + */
>> + rcu_assign_pointer(kvm->vm_file, NULL);
>> +#endif
>> +
>> kvm_irqfd_release(kvm);
>>
>> kvm_put_kvm(kvm);
>> @@ -5476,11 +5486,47 @@ bool file_is_kvm(struct file *file)
>> }
>> EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm);
>>
>> +struct file *kvm_create_vm_file(unsigned long type, const char *fdname)
>> +{
>> + struct kvm *kvm = kvm_create_vm(type, fdname);
>> + struct file *file;
>> +
>> + if (IS_ERR(kvm))
>> + return ERR_CAST(kvm);
>> +
>> + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> + if (IS_ERR(file)) {
>> + kvm_put_kvm(kvm);
>> + return file;
>> + }
>> +
>> +#ifdef CONFIG_LIVEUPDATE_GUEST_MEMFD
>> + /*
>> + * Weak reference to the file (without get_file()) to prevent a circular
>> + * dependency. Safe because the file's release path clears this pointer
>> + * and drops its reference to the VM.
>> + *
>> + * Written via rcu_assign_pointer() because the pointer can be read
>> + * locklessly under RCU (e.g., in kvm_gmem_luo_preserve() via
>> + * get_file_active() to prevent lockless ABA races).
>> + */
>> + rcu_assign_pointer(kvm->vm_file, file);
>> +#endif
>> +
>> + /*
>> + * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> + * already set, with ->release() being kvm_vm_release(). In error
>> + * cases it will be called by the final fput(file) and will take
>> + * care of doing kvm_put_kvm(kvm).
>> + */
>> +
>> + return file;
>> +}
>> +
>> static int kvm_dev_ioctl_create_vm(unsigned long type)
>> {
>> char fdname[ITOA_MAX_LEN + 1];
>> int r, fd;
>> - struct kvm *kvm;
>> struct file *file;
>>
>> fd = get_unused_fd_flags(O_CLOEXEC);
>> @@ -5489,31 +5535,17 @@ static int kvm_dev_ioctl_create_vm(unsigned long
>> type)
>>
>> snprintf(fdname, sizeof(fdname), "%d", fd);
>>
>> - kvm = kvm_create_vm(type, fdname);
>> - if (IS_ERR(kvm)) {
>> - r = PTR_ERR(kvm);
>> - goto put_fd;
>> - }
>> -
>> - file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>> + file = kvm_create_vm_file(type, fdname);
>> if (IS_ERR(file)) {
>> r = PTR_ERR(file);
>> - goto put_kvm;
>> + goto put_fd;
>> }
>>
>> - /*
>> - * Don't call kvm_put_kvm anymore at this point; file->f_op is
>> - * already set, with ->release() being kvm_vm_release(). In error
>> - * cases it will be called by the final fput(file) and will take
>> - * care of doing kvm_put_kvm(kvm).
>> - */
>> - kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, file->private_data);
>
> Notifying with file->private_data threw me off... I would rather inline
> the rcu_assign_pointer() in this function and have this line read
> notify(..., kvm) like before.
Yes, I will update it to:
struct kvm *kvm;
...
kvm = file->private_data;
notify (..., kvm);
About rcu_assign_pointer, I am not sure, IIUC. that only set the
kvm->vm_file pointer, which does not have any relation with
file->private_data. And keeping the rcu_assign_pointer(kvm->vm_file,
file) at the current place (inside kvm_create_vm_file) logically makes
sense. because, kvm_create_vm_file creates the struct kvm and vm_file,
So, all the relation variables should get updated there.
>
>>
>> fd_install(fd, file);
>> return fd;
>>
>> -put_kvm:
>> - kvm_put_kvm(kvm);
>> put_fd:
>> put_unused_fd(fd);
>> return r;
>> @@ -6341,6 +6373,11 @@ static void kvm_uevent_notify_change(unsigned int
>> type, struct kvm *kvm)
>> kfree(env);
>> }
>>
>> +void kvm_uevent_notify_vm_create(struct kvm *kvm)
>> +{
>> + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> +}
>> +
>> static void kvm_init_debug(void)
>> {
>> const struct file_operations *fops;
>> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
>> index 9fcc5d5b7f8d..7aa1d65c3d46 100644
>> --- a/virt/kvm/kvm_mm.h
>> +++ b/virt/kvm/kvm_mm.h
>> @@ -3,6 +3,9 @@
>> #ifndef __KVM_MM_H__
>> #define __KVM_MM_H__ 1
>>
>> +/* Worst case buffer size needed for holding an integer as a string. */
>> +#define ITOA_MAX_LEN 12
>> +
>> /*
>> * Architectures can choose whether to use an rwlock or spinlock
>> * for the mmu_lock. These macros, for use in common code
>> --
>> 2.54.0.1032.g2f8565e1d1-goog