On Thu, May 28, 2026 at 8:17 AM Lorenzo Stoakes <[email protected]> wrote: > > On Sat, Apr 25, 2026 at 11:27:16PM -0700, Suren Baghdasaryan wrote: > > proc/pid/{smaps|numa_maps} can be read using the combination of RCU and > > VMA read locks, similar to proc/pid/maps. RCU is required to safely > > traverse the VMA tree and VMA lock stabilizes the VMA being processed > > and the pagetable walk. > > > > Signed-off-by: Suren Baghdasaryan <[email protected]> > > Reviewed-by: Liam R. Howlett <[email protected]> > > The logic LGTM, some nitty things below but in general: > > Reviewed-by: Lorenzo Stoakes <[email protected]>
Thanks! > > > --- > > fs/proc/task_mmu.c | 195 ++++++++++++++++++++++++++++++++++++--------- > > 1 file changed, 156 insertions(+), 39 deletions(-) > > > > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c > > index 751b9ba160fb..1e3a15bf46f4 100644 > > --- a/fs/proc/task_mmu.c > > +++ b/fs/proc/task_mmu.c > > @@ -132,6 +132,22 @@ static void release_task_mempolicy(struct > > proc_maps_private *priv) > > > > #ifdef CONFIG_PER_VMA_LOCK > > > > +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) > > NIT: But these inlines are unnecessary in a C file. At the kernel optimisation > level the compiler already decides whether to inline or not. > > However in the kernel we redefine this so we lose function tracing and also we > don't get warnings about unused functions [0], so this is actually a bit > harmful > - you can end up with dead code and not know abou tit. > > Same goes for all the other static inline's in task_mmu.c. I see. There are a bunch of these small inline functions. Once the dust settles, I can clean them up. Dave's series and my folloup patch to change smaps_rollup will be changing this code. Maybe after all that is done? > > [0]: > https://elixir.bootlin.com/linux/v7.0.10/source/include/linux/compiler_types.h#L235 > > > +{ > > + int ret = mmap_read_lock_killable(lock_ctx->mm); > > + > > + if (!ret) > > + lock_ctx->mmap_locked = true; > > + > > + return ret; > > +} > > + > > +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) > > +{ > > + mmap_read_unlock(lock_ctx->mm); > > + lock_ctx->mmap_locked = false; > > +} > > + > > static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) > > { > > lock_ctx->locked_vma = NULL; > > @@ -146,25 +162,11 @@ static void unlock_ctx_vma(struct > > proc_maps_locking_ctx *lock_ctx) > > } > > } > > > > -static const struct seq_operations proc_pid_maps_op; > > - > > static inline bool lock_vma_range(struct seq_file *m, > > struct proc_maps_locking_ctx *lock_ctx) > > { > > - /* > > - * smaps and numa_maps perform page table walk, therefore require > > - * mmap_lock but maps can be read with locking just the vma and > > - * walking the vma tree under rcu read protection. > > - */ > > - if (m->op != &proc_pid_maps_op) { > > - if (mmap_read_lock_killable(lock_ctx->mm)) > > - return false; > > - > > - lock_ctx->mmap_locked = true; > > - } else { > > - rcu_read_lock(); > > - reset_lock_ctx(lock_ctx); > > - } > > + rcu_read_lock(); > > + reset_lock_ctx(lock_ctx); > > > > return true; > > } > > @@ -172,7 +174,7 @@ static inline bool lock_vma_range(struct seq_file *m, > > static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) > > { > > if (lock_ctx->mmap_locked) { > > - mmap_read_unlock(lock_ctx->mm); > > + unlock_ctx_mm(lock_ctx); > > } else { > > unlock_ctx_vma(lock_ctx); > > rcu_read_unlock(); > > @@ -213,17 +215,45 @@ static inline bool fallback_to_mmap_lock(struct > > proc_maps_private *priv, > > return true; > > } > > > > +static inline void drop_rcu(struct proc_maps_private *priv) > > +{ > > + if (priv->lock_ctx.mmap_locked) > > + return; > > + > > + rcu_read_unlock(); > > +} > > + > > +static inline void reacquire_rcu(struct proc_maps_private *priv) > > +{ > > + if (priv->lock_ctx.mmap_locked) > > + return; > > + > > + rcu_read_lock(); > > + /* Reinitialize the iterator. */ > > + vma_iter_set(&priv->iter, priv->lock_ctx.locked_vma->vm_end); > > OK good that we reset this :) > > > +} > > + > > #else /* CONFIG_PER_VMA_LOCK */ > > > > We need to pull out the patches from Dave's series and remove this option and > default VMA locks on by default :) I didn't see any followup after Dave posted his v1. I agree it would make things much simpler but I don't want to delay these changes too much. I'll ask Dave if he plans to continue his cleanup. If not, I'll pick it up. > > > +static inline int lock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) > > +{ > > + return mmap_read_lock_killable(lock_ctx->mm); > > +} > > + > > +static inline void unlock_ctx_mm(struct proc_maps_locking_ctx *lock_ctx) > > +{ > > + mmap_read_unlock(lock_ctx->mm); > > +} > > + > > static inline bool lock_vma_range(struct seq_file *m, > > struct proc_maps_locking_ctx *lock_ctx) > > { > > - return mmap_read_lock_killable(lock_ctx->mm) == 0; > > + return lock_ctx_mm(lock_ctx) == 0; > > } > > > > static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) > > { > > - mmap_read_unlock(lock_ctx->mm); > > + unlock_ctx_mm(lock_ctx); > > } > > > > static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, > > @@ -238,6 +268,9 @@ static inline bool fallback_to_mmap_lock(struct > > proc_maps_private *priv, > > return false; > > } > > > > +static inline void drop_rcu(struct proc_maps_private *priv) {} > > +static inline void reacquire_rcu(struct proc_maps_private *priv) {} > > + > > #endif /* CONFIG_PER_VMA_LOCK */ > > > > static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t > > *ppos) > > @@ -538,12 +571,10 @@ static int query_vma_setup(struct > > proc_maps_locking_ctx *lock_ctx) > > > > static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) > > { > > - if (lock_ctx->mmap_locked) { > > - mmap_read_unlock(lock_ctx->mm); > > - lock_ctx->mmap_locked = false; > > - } else { > > + if (lock_ctx->mmap_locked) > > + unlock_ctx_mm(lock_ctx); > > + else > > unlock_ctx_vma(lock_ctx); > > - } > > } > > > > static struct vm_area_struct *query_vma_find_by_addr(struct > > proc_maps_locking_ctx *lock_ctx, > > @@ -1280,21 +1311,75 @@ static const struct mm_walk_ops > > smaps_shmem_walk_ops = { > > .walk_lock = PGWALK_RDLOCK, > > }; > > > > +#ifdef CONFIG_PER_VMA_LOCK > > + > > +static const struct mm_walk_ops smaps_walk_vma_lock_ops = { > > + .pmd_entry = smaps_pte_range, > > + .hugetlb_entry = smaps_hugetlb_range, > > + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, > > +}; > > + > > +static const struct mm_walk_ops smaps_shmem_walk_vma_lock_ops = { > > + .pmd_entry = smaps_pte_range, > > + .hugetlb_entry = smaps_hugetlb_range, > > + .pte_hole = smaps_pte_hole, > > + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, > > +}; > > + > > +static inline const struct mm_walk_ops * > > +get_smaps_walk_ops(struct proc_maps_private *priv) > > +{ > > + if (priv->lock_ctx.mmap_locked) > > + return &smaps_walk_ops; > > + return &smaps_walk_vma_lock_ops; > > +} > > + > > +static inline const struct mm_walk_ops * > > +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) > > +{ > > + if (priv->lock_ctx.mmap_locked) > > + return &smaps_shmem_walk_ops; > > + return &smaps_shmem_walk_vma_lock_ops; > > +} > > + > > +#else /* CONFIG_PER_VMA_LOCK */ > > + > > +static inline const struct mm_walk_ops * > > +get_smaps_walk_ops(struct proc_maps_private *priv) > > +{ > > + return &smaps_walk_ops; > > +} > > + > > +static inline const struct mm_walk_ops * > > +get_smaps_shmem_walk_ops(struct proc_maps_private *priv) > > +{ > > + return &smaps_shmem_walk_ops; > > +} > > + > > +#endif /* CONFIG_PER_VMA_LOCK */ > > + > > /* > > * Gather mem stats from @vma with the indicated beginning > > * address @start, and keep them in @mss. > > * > > * Use vm_start of @vma as the beginning address if @start is 0. > > */ > > -static void smap_gather_stats(struct vm_area_struct *vma, > > - struct mem_size_stats *mss, unsigned long start) > > +static void smap_gather_stats(struct proc_maps_private *priv, > > + struct vm_area_struct *vma, > > + struct mem_size_stats *mss, unsigned long start) > > { > > - const struct mm_walk_ops *ops = &smaps_walk_ops; > > + const struct mm_walk_ops *ops = get_smaps_walk_ops(priv); > > > > /* Invalid start */ > > if (start >= vma->vm_end) > > return; > > > > + if (vma == get_gate_vma(priv->lock_ctx.mm)) > > + return; > > + > > + /* Might sleep. Drop RCU read lock but keep the VMA locked. */ > > + drop_rcu(priv); > > + > > if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { > > /* > > * For shared or readonly shmem mappings we know that all > > @@ -1312,15 +1397,16 @@ static void smap_gather_stats(struct vm_area_struct > > *vma, > > !(vma->vm_flags & VM_WRITE))) { > > mss->swap += shmem_swapped; > > } else { > > - ops = &smaps_shmem_walk_ops; > > + ops = get_smaps_shmem_walk_ops(priv); > > } > > } > > > > - /* mmap_lock is held in m_start */ > > if (!start) > > walk_page_vma(vma, ops, mss); > > else > > walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); > > + > > + reacquire_rcu(priv); > > OK I was going to say that this RCU lock dance is a bit iffy, but I see > that we're sharing code such that this is necessary. > > > } > > > > #define SEQ_PUT_DEC(str, val) \ > > @@ -1369,10 +1455,11 @@ static void __show_smap(struct seq_file *m, const > > struct mem_size_stats *mss, > > > > static int show_smap(struct seq_file *m, void *v) > > { > > + struct proc_maps_private *priv = m->private; > > struct vm_area_struct *vma = v; > > struct mem_size_stats mss = {}; > > > > - smap_gather_stats(vma, &mss, 0); > > + smap_gather_stats(priv, vma, &mss, 0); > > > > show_map_vma(m, vma); > > > > @@ -1413,7 +1500,7 @@ static int show_smaps_rollup(struct seq_file *m, void > > *v) > > goto out_put_task; > > } > > > > - ret = mmap_read_lock_killable(mm); > > + ret = lock_ctx_mm(&priv->lock_ctx); > > if (ret) > > goto out_put_mm; > > > > @@ -1425,7 +1512,7 @@ static int show_smaps_rollup(struct seq_file *m, void > > *v) > > > > vma_start = vma->vm_start; > > do { > > - smap_gather_stats(vma, &mss, 0); > > + smap_gather_stats(priv, vma, &mss, 0); > > last_vma_end = vma->vm_end; > > > > /* > > @@ -1434,8 +1521,8 @@ static int show_smaps_rollup(struct seq_file *m, void > > *v) > > */ > > if (mmap_lock_is_contended(mm)) { > > vma_iter_invalidate(&vmi); > > - mmap_read_unlock(mm); > > - ret = mmap_read_lock_killable(mm); > > + unlock_ctx_mm(&priv->lock_ctx); > > + ret = lock_ctx_mm(&priv->lock_ctx); > > if (ret) { > > release_task_mempolicy(priv); > > goto out_put_mm; > > @@ -1484,14 +1571,14 @@ static int show_smaps_rollup(struct seq_file *m, > > void *v) > > > > /* Case 1 and 2 above */ > > if (vma->vm_start >= last_vma_end) { > > - smap_gather_stats(vma, &mss, 0); > > + smap_gather_stats(priv, vma, &mss, 0); > > last_vma_end = vma->vm_end; > > continue; > > } > > > > /* Case 4 above */ > > if (vma->vm_end > last_vma_end) { > > - smap_gather_stats(vma, &mss, last_vma_end); > > + smap_gather_stats(priv, vma, &mss, > > last_vma_end); > > last_vma_end = vma->vm_end; > > } > > } > > @@ -1505,7 +1592,7 @@ static int show_smaps_rollup(struct seq_file *m, void > > *v) > > __show_smap(m, &mss, true); > > > > release_task_mempolicy(priv); > > - mmap_read_unlock(mm); > > + unlock_ctx_mm(&priv->lock_ctx); > > > > out_put_mm: > > mmput(mm); > > @@ -3291,6 +3378,31 @@ static const struct mm_walk_ops show_numa_ops = { > > .walk_lock = PGWALK_RDLOCK, > > }; > > > > +#ifdef CONFIG_PER_VMA_LOCK > > +static const struct mm_walk_ops show_numa_vma_lock_ops = { > > + .hugetlb_entry = gather_hugetlb_stats, > > + .pmd_entry = gather_pte_stats, > > + .walk_lock = PGWALK_VMA_RDLOCK_VERIFY, > > +}; > > + > > +static inline const struct mm_walk_ops * > > +get_show_numa_ops(struct proc_maps_private *priv) > > +{ > > + if (priv->lock_ctx.mmap_locked) > > + return &show_numa_ops; > > + return &show_numa_vma_lock_ops; > > +} > > + > > +#else /* CONFIG_PER_VMA_LOCK */ > > + > > +static inline const struct mm_walk_ops * > > +get_show_numa_ops(struct proc_maps_private *priv) > > +{ > > + return &show_numa_ops; > > +} > > + > > +#endif /* CONFIG_PER_VMA_LOCK */ > > + > > /* > > * Display pages allocated per node and memory policy via /proc. > > */ > > @@ -3335,8 +3447,13 @@ static int show_numa_map(struct seq_file *m, void *v) > > if (is_vm_hugetlb_page(vma)) > > seq_puts(m, " huge"); > > > > - /* mmap_lock is held by m_start */ > > - walk_page_vma(vma, &show_numa_ops, md); > > + /* Skip walking pages if gate VMA */ > > + if (vma != get_gate_vma(proc_priv->lock_ctx.mm)) { > > + /* Might sleep. Drop RCU read lock but keep the VMA locked. */ > > + drop_rcu(proc_priv); > > + walk_page_vma(vma, get_show_numa_ops(proc_priv), md); > > + reacquire_rcu(proc_priv); > > + } > > > > if (!md->pages) > > goto out; > > -- > > 2.54.0.545.g6539524ca2-goog > >

