+cc linux-api - see the description of the new behavior below

On 7/11/25 13:38, Lorenzo Stoakes wrote:
> Historically we've made it a uAPI requirement that mremap() may only
> operate on a single VMA at a time.
> 
> For instances where VMAs need to be resized, this makes sense, as it
> becomes very difficult to determine what a user actually wants should they
> indicate a desire to expand or shrink the size of multiple VMAs (truncate?
> Adjust sizes individually? Some other strategy?).
> 
> However, in instances where a user is moving VMAs, it is restrictive to
> disallow this.
> 
> This is especially the case when anonymous mapping remap may or may not be
> mergeable depending on whether VMAs have or have not been faulted due to
> anon_vma assignment and folio index alignment with vma->vm_pgoff.
> 
> Often this can result in surprising impact where a moved region is faulted,
> then moved back and a user fails to observe a merge from otherwise
> compatible, adjacent VMAs.
> 
> This change allows such cases to work without the user having to be
> cognizant of whether a prior mremap() move or other VMA operations has
> resulted in VMA fragmentation.
> 
> We only permit this for mremap() operations that do NOT change the size of
> the VMA and DO specify MREMAP_MAYMOVE | MREMAP_FIXED.
> 
> Should no VMA exist in the range, -EFAULT is returned as usual.
> 
> If a VMA move spans a single VMA - then there is no functional change.
> 
> Otherwise, we place additional requirements upon VMAs:
> 
> * They must not have a userfaultfd context associated with them - this
>   requires dropping the lock to notify users, and we want to perform the
>   operation with the mmap write lock held throughout.
> 
> * If file-backed, they cannot have a custom get_unmapped_area handler -
>   this might result in MREMAP_FIXED not being honoured, which could result
>   in unexpected positioning of VMAs in the moved region.
> 
> There may be gaps in the range of VMAs that are moved:
> 
>                    X        Y                       X        Y
>                  <--->     <->                    <--->     <->
>          |-------|   |-----| |-----|      |-------|   |-----| |-----|
>          |   A   |   |  B  | |  C  | ---> |   A'  |   |  B' | |  C' |
>          |-------|   |-----| |-----|      |-------|   |-----| |-----|
>         addr                           new_addr
> 
> The move will preserve the gaps between each VMA.

AFAIU "moving a gap" doesn't mean we unmap anything pre-existing in the
destination range where the moved gap falls, right? Worth pointing out explicitly.

> Note that any failures encountered will result in a partial move. Since an
> mremap() can fail at any time, this might result in only some of the VMAs
> being moved.
> 
> Note that failures are very rare and typically require an out of a memory
> condition or a mapping limit condition to be hit, assuming the VMAs being
> moved are valid.
> 
> We don't try to assess ahead of time whether VMAs are valid according to
> the multi VMA rules, as it would be rather unusual for a user to mix
> uffd-enabled VMAs and/or VMAs which map unusual driver mappings that
> specify custom get_unmapped_area() handlers in an aggregate operation.
> 
> So we optimise for the far, far more likely case of the operation being
> entirely permissible.

Guess it's the sanest thing to do given all the circumstances.

> In the case of the move of a single VMA, the above conditions are
> permitted. This makes the behaviour identical for a single VMA as before.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>

Reviewed-by: Vlastimil Babka <vba...@suse.cz>

Some nits:

> ---
>  mm/mremap.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 150 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 8cb08ccea6ad..59f49de0f84e 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -69,6 +69,8 @@ struct vma_remap_struct {
>       enum mremap_type remap_type;    /* expand, shrink, etc. */
>       bool mmap_locked;               /* Is mm currently write-locked? */
>       unsigned long charged;          /* If VM_ACCOUNT, # pages to account. */
> +     bool seen_vma;                  /* Is >1 VMA being moved? */

Seems this could be a local variable of remap_move().

> +     bool vmi_needs_reset;           /* Was the VMA iterator invalidated? */
>  };
>  
>  static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
> @@ -1188,6 +1190,9 @@ static int copy_vma_and_data(struct vma_remap_struct 
> *vrm,
>               *new_vma_ptr = NULL;
>               return -ENOMEM;
>       }

A newline here?

> +     if (vma != vrm->vma)
> +             vrm->vmi_needs_reset = true;

A comment on what this condition means wouldn't hurt? Is it when "Source vma
may have been merged into new_vma" in copy_vma(), or when not?

(no comments below, remaining quoted part left for linux-api reference)

> +
>       vrm->vma = vma;
>       pmc.old = vma;
>       pmc.new = new_vma;
> @@ -1583,6 +1588,18 @@ static bool vrm_will_map_new(struct vma_remap_struct 
> *vrm)
>       return false;
>  }
>  
> +/* Does this remap ONLY move mappings? */
> +static bool vrm_move_only(struct vma_remap_struct *vrm)
> +{
> +     if (!(vrm->flags & MREMAP_FIXED))
> +             return false;
> +
> +     if (vrm->old_len != vrm->new_len)
> +             return false;
> +
> +     return true;
> +}
> +
>  static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
>  {
>       struct mm_struct *mm = current->mm;
> @@ -1597,6 +1614,32 @@ static void notify_uffd(struct vma_remap_struct *vrm, 
> bool failed)
>       userfaultfd_unmap_complete(mm, vrm->uf_unmap);
>  }
>  
> +static bool vma_multi_allowed(struct vm_area_struct *vma)
> +{
> +     struct file *file;
> +
> +     /*
> +      * We can't support moving multiple uffd VMAs as notify requires
> +      * mmap lock to be dropped.
> +      */
> +     if (userfaultfd_armed(vma))
> +             return false;
> +
> +     /*
> +      * Custom get unmapped area might result in MREMAP_FIXED not
> +      * being obeyed.
> +      */
> +     file = vma->vm_file;
> +     if (file && !vma_is_shmem(vma) && !is_vm_hugetlb_page(vma)) {
> +             const struct file_operations *fop = file->f_op;
> +
> +             if (fop->get_unmapped_area)
> +                     return false;
> +     }
> +
> +     return true;
> +}
> +
>  static int check_prep_vma(struct vma_remap_struct *vrm)
>  {
>       struct vm_area_struct *vma = vrm->vma;
> @@ -1646,7 +1689,19 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
>                       (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
>               return -EINVAL;
>  
> -     /* We can't remap across vm area boundaries */
> +     /*
> +      * We can't remap across the end of VMAs, as another VMA may be
> +      * adjacent:
> +      *
> +      *       addr   vma->vm_end
> +      *  |-----.----------|
> +      *  |     .          |
> +      *  |-----.----------|
> +      *        .<--------->xxx>
> +      *            old_len
> +      *
> +      * We also require that vma->vm_start <= addr < vma->vm_end.
> +      */
>       if (old_len > vma->vm_end - addr)
>               return -EFAULT;
>  
> @@ -1746,6 +1801,90 @@ static unsigned long check_mremap_params(struct 
> vma_remap_struct *vrm)
>       return 0;
>  }
>  
> +static unsigned long remap_move(struct vma_remap_struct *vrm)
> +{
> +     struct vm_area_struct *vma;
> +     unsigned long start = vrm->addr;
> +     unsigned long end = vrm->addr + vrm->old_len;
> +     unsigned long new_addr = vrm->new_addr;
> +     unsigned long target_addr = new_addr;
> +     unsigned long res = -EFAULT;
> +     unsigned long last_end;
> +     bool allowed = true;
> +     VMA_ITERATOR(vmi, current->mm, start);
> +
> +     /*
> +      * When moving VMAs we allow for batched moves across multiple VMAs,
> +      * with all VMAs in the input range [addr, addr + old_len) being moved
> +      * (and split as necessary).
> +      */
> +     for_each_vma_range(vmi, vma, end) {
> +             /* Account for start, end not aligned with VMA start, end. */
> +             unsigned long addr = max(vma->vm_start, start);
> +             unsigned long len = min(end, vma->vm_end) - addr;
> +             unsigned long offset, res_vma;
> +
> +             if (!allowed)
> +                     return -EFAULT;
> +
> +             /* No gap permitted at the start of the range. */
> +             if (!vrm->seen_vma && start < vma->vm_start)
> +                     return -EFAULT;
> +
> +             /*
> +              * To sensibly move multiple VMAs, accounting for the fact that
> +              * get_unmapped_area() may align even MAP_FIXED moves, we simply
> +              * attempt to move such that the gaps between source VMAs remain
> +              * consistent in destination VMAs, e.g.:
> +              *
> +              *           X        Y                       X        Y
> +              *         <--->     <->                    <--->     <->
> +              * |-------|   |-----| |-----|      |-------|   |-----| |-----|
> +              * |   A   |   |  B  | |  C  | ---> |   A'  |   |  B' | |  C' |
> +              * |-------|   |-----| |-----|      |-------|   |-----| |-----|
> +              *                               new_addr
> +              *
> +              * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
> +              */
> +             offset = vrm->seen_vma ? vma->vm_start - last_end : 0;
> +             last_end = vma->vm_end;
> +
> +             vrm->vma = vma;
> +             vrm->addr = addr;
> +             vrm->new_addr = target_addr + offset;
> +             vrm->old_len = vrm->new_len = len;
> +
> +             allowed = vma_multi_allowed(vma);
> +             if (vrm->seen_vma && !allowed)
> +                     return -EFAULT;
> +
> +             res_vma = check_prep_vma(vrm);
> +             if (!res_vma)
> +                     res_vma = mremap_to(vrm);
> +             if (IS_ERR_VALUE(res_vma))
> +                     return res_vma;
> +
> +             if (!vrm->seen_vma) {
> +                     VM_WARN_ON_ONCE(allowed && res_vma != new_addr);
> +                     res = res_vma;
> +             }
> +
> +             /* mmap lock is only dropped on shrink. */
> +             VM_WARN_ON_ONCE(!vrm->mmap_locked);
> +             /* This is a move, no expand should occur. */
> +             VM_WARN_ON_ONCE(vrm->populate_expand);
> +
> +             if (vrm->vmi_needs_reset) {
> +                     vma_iter_reset(&vmi);
> +                     vrm->vmi_needs_reset = false;
> +             }
> +             vrm->seen_vma = true;
> +             target_addr = res_vma + vrm->new_len;
> +     }
> +
> +     return res;
> +}
> +
>  static unsigned long do_mremap(struct vma_remap_struct *vrm)
>  {
>       struct mm_struct *mm = current->mm;
> @@ -1763,13 +1902,17 @@ static unsigned long do_mremap(struct 
> vma_remap_struct *vrm)
>               return -EINTR;
>       vrm->mmap_locked = true;
>  
> -     vrm->vma = vma_lookup(current->mm, vrm->addr);
> -     res = check_prep_vma(vrm);
> -     if (res)
> -             goto out;
> +     if (vrm_move_only(vrm)) {
> +             res = remap_move(vrm);
> +     } else {
> +             vrm->vma = vma_lookup(current->mm, vrm->addr);
> +             res = check_prep_vma(vrm);
> +             if (res)
> +                     goto out;
>  
> -     /* Actually execute mremap. */
> -     res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
> +             /* Actually execute mremap. */
> +             res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : 
> mremap_at(vrm);
> +     }
>  
>  out:
>       failed = IS_ERR_VALUE(res);


Reply via email to