On 10/17/25 22:11, Ackerley Tng wrote:
> From: Sean Christopherson <[email protected]>
>
> Start plumbing in guest_memfd support for in-place private<=>shared
> conversions by tracking attributes via a maple tree. KVM currently tracks
> private vs. shared attributes on a per-VM basis, which made sense when a
> guest_memfd _only_ supported private memory, but tracking per-VM simply
> can't work for in-place conversions as the shareability of a given page
> needs to be per-gmem_inode, not per-VM.
>
> Use the filemap invalidation lock to protect the maple tree, as taking the
> lock for read when faulting in memory (for userspace or the guest) isn't
> expected to result in meaningful contention, and using a separate lock
> would add significant complexity (avoiding deadlock is quite difficult).

+Cc Liam and maple-tree list, especially for this part.
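To save folks a trip through the whole diff, the usage pattern boils down
to the following (condensed from the patch below): readers take the
mapping's invalidate_lock shared around loads, writers take it exclusively
around stores, and the tree is switched to external locking so lockdep can
check that.

	/* Fault/read paths: attributes are read under the shared lock. */
	filemap_invalidate_lock_shared(inode->i_mapping);
	entry = mtree_load(&GMEM_I(inode)->attributes, index);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	/* Update paths: stores happen under the exclusive lock. */
	filemap_invalidate_lock(inode->i_mapping);
	r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
	filemap_invalidate_unlock(inode->i_mapping);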
> Signed-off-by: Sean Christopherson <[email protected]>
> Co-developed-by: Ackerley Tng <[email protected]>
> Signed-off-by: Ackerley Tng <[email protected]>
> Co-developed-by: Vishal Annapurve <[email protected]>
> Signed-off-by: Vishal Annapurve <[email protected]>
> Co-developed-by: Fuad Tabba <[email protected]>
> Signed-off-by: Fuad Tabba <[email protected]>
> ---
>  virt/kvm/guest_memfd.c | 119 +++++++++++++++++++++++++++++++++++------
>  1 file changed, 103 insertions(+), 16 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index b22caa8b530ab..26cec833766c3 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
>  #include <linux/falloc.h>
>  #include <linux/fs.h>
>  #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>  #include <linux/mempolicy.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
> @@ -32,6 +33,7 @@ struct gmem_inode {
>  	struct inode vfs_inode;
>
>  	u64 flags;
> +	struct maple_tree attributes;
>  };
>
>  static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
> @@ -54,6 +56,23 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>  }
>
> +static u64 kvm_gmem_get_attributes(struct inode *inode, pgoff_t index)
> +{
> +	void *entry = mtree_load(&GMEM_I(inode)->attributes, index);
> +
> +	return WARN_ON_ONCE(!entry) ? 0 : xa_to_value(entry);
> +}
> +
> +static bool kvm_gmem_is_private_mem(struct inode *inode, pgoff_t index)
> +{
> +	return kvm_gmem_get_attributes(inode, index) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +
> +static bool kvm_gmem_is_shared_mem(struct inode *inode, pgoff_t index)
> +{
> +	return !kvm_gmem_is_private_mem(inode, index);
> +}
> +
>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				    pgoff_t index, struct folio *folio)
>  {
> @@ -415,10 +434,13 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>  	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
>  		return VM_FAULT_SIGBUS;
>
> -	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
> -		return VM_FAULT_SIGBUS;
> +	filemap_invalidate_lock_shared(inode->i_mapping);
> +	if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
> +		folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	else
> +		folio = ERR_PTR(-EACCES);
> +	filemap_invalidate_unlock_shared(inode->i_mapping);
>
> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
>  		if (PTR_ERR(folio) == -EAGAIN)
>  			return VM_FAULT_RETRY;
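The fault path above only ever reads under the shared lock, so I'm assuming
the eventual conversion path becomes the lone exclusive writer. Roughly
something like this is what I'd expect (purely a hypothetical sketch to
check my understanding -- kvm_gmem_set_attributes() doesn't exist in this
series, and the real thing also has to unmap and invalidate):

	static int kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
					   pgoff_t nr, u64 attrs)
	{
		MA_STATE(mas, &GMEM_I(inode)->attributes, start, start + nr - 1);
		int r;

		filemap_invalidate_lock(inode->i_mapping);
		/* Replaces every existing entry overlapping the range. */
		r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
		filemap_invalidate_unlock(inode->i_mapping);

		return r;
	}

Since a range store only ever overwrites and never punches holes, the
"every index is always represented" invariant established at init time
should hold.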
> @@ -572,6 +594,46 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
>  	return true;
>  }
>
> +static int kvm_gmem_init_inode(struct inode *inode, loff_t size, u64 flags)
> +{
> +	struct gmem_inode *gi = GMEM_I(inode);
> +	MA_STATE(mas, &gi->attributes, 0, (size >> PAGE_SHIFT) - 1);
> +	u64 attrs;
> +	int r;
> +
> +	inode->i_op = &kvm_gmem_iops;
> +	inode->i_mapping->a_ops = &kvm_gmem_aops;
> +	inode->i_mode |= S_IFREG;
> +	inode->i_size = size;
> +	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> +	mapping_set_inaccessible(inode->i_mapping);
> +	/* Unmovable mappings are supposed to be marked unevictable as well. */
> +	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> +
> +	gi->flags = flags;
> +
> +	mt_set_external_lock(&gi->attributes,
> +			     &inode->i_mapping->invalidate_lock);
> +
> +	/*
> +	 * Store default attributes for the entire gmem instance. Ensuring every
> +	 * index is represented in the maple tree at all times simplifies the
> +	 * conversion and merging logic.
> +	 */
> +	attrs = gi->flags & GUEST_MEMFD_FLAG_INIT_SHARED ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +
> +	/*
> +	 * Acquire the invalidation lock purely to make lockdep happy. There
> +	 * should be no races at this time since the inode hasn't yet been fully
> +	 * created.
> +	 */
> +	filemap_invalidate_lock(inode->i_mapping);
> +	r = mas_store_gfp(&mas, xa_mk_value(attrs), GFP_KERNEL);
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	return r;
> +}
> +
>  static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  {
>  	static const char *name = "[kvm-gmem]";
> @@ -602,16 +664,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
>  		goto err_fops;
>  	}
>
> -	inode->i_op = &kvm_gmem_iops;
> -	inode->i_mapping->a_ops = &kvm_gmem_aops;
> -	inode->i_mode |= S_IFREG;
> -	inode->i_size = size;
> -	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> -	mapping_set_inaccessible(inode->i_mapping);
> -	/* Unmovable mappings are supposed to be marked unevictable as well. */
> -	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> -
> -	GMEM_I(inode)->flags = flags;
> +	err = kvm_gmem_init_inode(inode, size, flags);
> +	if (err)
> +		goto err_inode;
>
>  	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR,
>  				 &kvm_gmem_fops);
>  	if (IS_ERR(file)) {
> @@ -798,9 +853,13 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (!file)
>  		return -EFAULT;
>
> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared,
>  				   max_order);
> -	if (IS_ERR(folio))
> -		return PTR_ERR(folio);
> +	if (IS_ERR(folio)) {
> +		r = PTR_ERR(folio);
> +		goto out;
> +	}
>
>  	if (!is_prepared)
>  		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> @@ -812,6 +871,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	else
>  		folio_put(folio);
>
> +out:
> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>  	return r;
>  }
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> @@ -925,13 +986,39 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
>
>  	mpol_shared_policy_init(&gi->policy, NULL);
>
> +	/*
> +	 * Memory attributes are protected by the filemap invalidation lock,
> +	 * but the lock structure isn't available at this time. Immediately
> +	 * mark the maple tree as using external locking so that accessing the
> +	 * tree before it's fully initialized results in NULL pointer
> +	 * dereferences and not more subtle bugs.
> +	 */
> +	mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
> +
>  	gi->flags = 0;
>  	return &gi->vfs_inode;
>  }
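One small observation on the encoding, mostly to confirm my reading: the
attribute words are stashed as xarray-style value entries, so the payload
is capped at LONG_MAX (xa_mk_value() shifts the value up by one bit).
That's obviously fine for the single KVM_MEMORY_ATTRIBUTE_PRIVATE bit
today, just worth remembering if attributes ever grow into the high bits.
Illustration only, not code from the patch:

	void *entry = xa_mk_value(KVM_MEMORY_ATTRIBUTE_PRIVATE);

	/* Round-trips losslessly for any value <= LONG_MAX. */
	WARN_ON(xa_to_value(entry) != KVM_MEMORY_ATTRIBUTE_PRIVATE);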
>  static void kvm_gmem_destroy_inode(struct inode *inode)
>  {
> -	mpol_free_shared_policy(&GMEM_I(inode)->policy);
> +	struct gmem_inode *gi = GMEM_I(inode);
> +
> +	mpol_free_shared_policy(&gi->policy);
> +
> +	/*
> +	 * Note! Checking for an empty tree is functionally necessary to avoid
> +	 * explosions if the tree hasn't been initialized, i.e. if the inode is
> +	 * being destroyed before guest_memfd can set the external lock.
> +	 */
> +	if (!mtree_empty(&gi->attributes)) {
> +		/*
> +		 * Acquire the invalidation lock purely to make lockdep happy;
> +		 * the inode is unreachable at this point.
> +		 */
> +		filemap_invalidate_lock(inode->i_mapping);
> +		__mt_destroy(&gi->attributes);
> +		filemap_invalidate_unlock(inode->i_mapping);
> +	}
>  }
>
>  static void kvm_gmem_free_inode(struct inode *inode)
> --
> 2.51.0.858.gf9c4a03a3a-goog
