Tarun Sahu <[email protected]> writes:

> This patch introduces the freeze on gmem_inode which prevents

Can't find the reference now, but commit messages should take the
imperative mood and avoid "this patch" [*]

[*] https://lore.kernel.org/all/[email protected]/

> the fallocate call and any new page fault allocation. This will avoid
> gmem file modification when it is being preserved
>
> Used srcu lock to synchronise the freeze call, where write blocks
> until all the reads are free. And reads are re-entrant.
>
> Incase fault fails, It return -EPERM and VM_EXIT to userspace. userspace
> must handle this properly as every new fault will fail.
>
> Signed-off-by: Tarun Sahu <[email protected]>
>
> [...snip...]
>
> @@ -105,12 +108,20 @@ static struct folio *kvm_gmem_get_folio(struct inode 
> *inode, pgoff_t index)
>       if (!IS_ERR(folio))
>               return folio;
>
> +     idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +     if (kvm_gmem_is_frozen(inode)) {
> +             srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +             return ERR_PTR(-EPERM);
> +     }
> +
>       policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
>       folio = __filemap_get_folio_mpol(inode->i_mapping, index,
>                                        FGP_LOCK | FGP_CREAT,
>                                        mapping_gfp_mask(inode->i_mapping), 
> policy);
>       mpol_cond_put(policy);
>
> +     srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +
>       /*
>        * External interfaces like kvm_gmem_get_pfn() support dealing
>        * with hugepages to a degree, but internally, guest_memfd currently
> @@ -273,16 +284,30 @@ static long kvm_gmem_allocate(struct inode *inode, 
> loff_t offset, loff_t len)
>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>                              loff_t len)
>  {
> +     struct inode *inode = file_inode(file);
>       int ret;
> +     int idx;
>
> -     if (!(mode & FALLOC_FL_KEEP_SIZE))
> -             return -EOPNOTSUPP;
> +     idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
> +     if (kvm_gmem_is_frozen(inode)) {
> +             srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
> +             return -EPERM;
> +     }

fallocate may eventually go to kvm_gmem_get_folio(), so that would check
kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?

>
> -     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> -             return -EOPNOTSUPP;
> +     if (!(mode & FALLOC_FL_KEEP_SIZE)) {
> +             ret = -EOPNOTSUPP;
> +             goto out;
> +     }
>
> -     if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> -             return -EINVAL;
> +     if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) {
> +             ret = -EOPNOTSUPP;
> +             goto out;
> +     }
> +
> +     if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) {
> +             ret = -EINVAL;
> +             goto out;
> +     }

There's some reordering here. Why not let the validation happen like
before, then check kvm_gmem_is_frozen()?

>
>       if (mode & FALLOC_FL_PUNCH_HOLE)
>               ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
>
> [...snip...]
>
> +
> +/**
> + * kvm_gmem_freeze - Freeze or unfreeze a guest_memfd inode mapping.
> + * @inode: The guest_memfd inode.
> + * @freeze: True to freeze, false to unfreeze.
> + *
> + * This API is used strictly during the live update / preservation transition
> + * window to prevent host userspace and guest-side faults from making any
> + * mapping modifications (such as fallocate or page fault allocation)
> + * to the guest_memfd page cache.
> + *
> + * Synchronization Strategy (Sleepable RCU):
> + * To avoid high-contention VFS locks (like inode_lock or
> + * filemap_invalidate_lock) on the vCPU page fault hot paths, this subsystem
> + * implements a lightweight, system-wide Sleepable RCU (SRCU) mechanism
> + * (`kvm_gmem_freeze_srcu`):
> + *
> + * Global vs. Per-Inode SRCU
> + * ======================
> + * A single system-wide global static `srcu_struct` is used instead of a
> + * per-inode SRCU structure to completely prevent unprivileged users from
> + * exhausting the host's per-CPU memory allocator. Because
> + * `init_srcu_struct()` allocates per-CPU memory via `alloc_percpu()`, which
> + * is not accounted by memory cgroups (memcg),
> + * a per-inode SRCU structure would allow a tenant to bypass cgroup limits 
> and
> + * trigger a system-wide Out-of-Memory (OOM) crash simply by spawning a large
> + * number of guest_memfd file descriptors (bounded only by RLIMIT_NOFILE).
> + *
> + * Flag Modification Note:
> + * Since `GUEST_MEMFD_F_MAPPING_FROZEN` is the ONLY flag in
> + * `GMEM_I(inode)->flags` that is mutated dynamically at runtime (all other
> + * flags are creation-time flags which remain strictly read-only), there is
> + * no possibility of concurrent bit-modification races. Therefore, a standard
> + * `WRITE_ONCE` is fully safe and does not require complex `cmpxchg`
> + * synchronization loops.
> + */
> +void kvm_gmem_freeze(struct inode *inode, bool freeze)
> +{
> +     u64 flags = READ_ONCE(GMEM_I(inode)->flags);
> +
> +     if (freeze)
> +             flags |= GUEST_MEMFD_F_MAPPING_FROZEN;
> +     else
> +             flags &= ~GUEST_MEMFD_F_MAPPING_FROZEN;
> +
> +     WRITE_ONCE(GMEM_I(inode)->flags, flags);
> +
> +     if (freeze)
> +             synchronize_srcu(&kvm_gmem_freeze_srcu);

Why only synchronize on freeze but not unfreeze?

> +}
> +
>
> [...snip...]
>

Reply via email to