On Fri, Feb 21, 2025 at 06:02:49PM +0100, Michal Koutný wrote:
> Noone would need to use this allocation strategy (it's slower, pid
> numbers collide sooner). Its primary purpose are pid namespaces in
> conjunction with pids.max cgroup limit which keeps (virtual) pid numbers
> below the given limit. This is for 32-bit userspace programs that may
> not work well with pid numbers above 65536.
> 
> Link: 
> https://lore.kernel.org/r/[email protected]/
> Signed-off-by: Michal Koutný <[email protected]>
> ---
>  Documentation/admin-guide/sysctl/kernel.rst |  2 ++
>  include/linux/pid_namespace.h               |  3 +++
>  kernel/pid.c                                | 12 +++++++--
>  kernel/pid_namespace.c                      | 28 +++++++++++++++------
>  4 files changed, 36 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst 
> b/Documentation/admin-guide/sysctl/kernel.rst
> index a43b78b4b6464..f5e68d1c8849f 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -1043,6 +1043,8 @@ The last pid allocated in the current (the one task 
> using this sysctl
>  lives in) pid namespace. When selecting a pid for a next task on fork
>  kernel tries to allocate a number starting from this one.
>  
> +When set to -1, first-fit pid numbering is used instead of the next-fit.

I strongly disagree with this approach. This is way worse then making
pid_max per pid namespace.

I'm fine if you come up with something else that's purely based on
cgroups somehow and is uniform across 64-bit and 32-bit. Allowing to
change the pid allocation strategy just for 32-bit is not the solution
and not mergable.

> +
>  
>  powersave-nap (PPC only)
>  ========================
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index f9f9931e02d6a..10bf66ca78590 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -41,6 +41,9 @@ struct pid_namespace {
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>       int memfd_noexec_scope;
>  #endif
> +#ifdef CONFIG_IA32_EMULATION
> +     bool pid_noncyclic;
> +#endif
>  } __randomize_layout;
>  
>  extern struct pid_namespace init_pid_ns;
> diff --git a/kernel/pid.c b/kernel/pid.c
> index aa2a7d4da4555..e9da1662b8821 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -191,6 +191,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t 
> *set_tid,
>  
>       for (i = ns->level; i >= 0; i--) {
>               int tid = 0;
> +             bool pid_noncyclic = 0;
> +#ifdef CONFIG_IA32_EMULATION
> +             pid_noncyclic = READ_ONCE(tmp->pid_noncyclic);
> +#endif
>  
>               if (set_tid_size) {
>                       tid = set_tid[ns->level - i];
> @@ -235,8 +239,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t 
> *set_tid,
>                        * Store a null pointer so find_pid_ns does not find
>                        * a partially initialized PID (see below).
>                        */
> -                     nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> -                                           pid_max, GFP_ATOMIC);
> +                     if (likely(!pid_noncyclic))
> +                             nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
> +                                                   pid_max, GFP_ATOMIC);
> +                     else
> +                             nr = idr_alloc(&tmp->idr, NULL, pid_min,
> +                                                   pid_max, GFP_ATOMIC);
>               }
>               spin_unlock_irq(&pidmap_lock);
>               idr_preload_end();
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 0f23285be4f92..ceda94a064294 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -113,6 +113,9 @@ static struct pid_namespace *create_pid_namespace(struct 
> user_namespace *user_ns
>       ns->pid_allocated = PIDNS_ADDING;
>  #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
>       ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
> +#endif
> +#ifdef CONFIG_IA32_EMULATION
> +     ns->pid_noncyclic = READ_ONCE(parent_pid_ns->pid_noncyclic);
>  #endif
>       return ns;
>  
> @@ -260,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>       return;
>  }
>  
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>  static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
>               void *buffer, size_t *lenp, loff_t *ppos)
>  {
> @@ -271,12 +274,23 @@ static int pid_ns_ctl_handler(const struct ctl_table 
> *table, int write,
>       if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns))
>               return -EPERM;
>  
> -     next = idr_get_cursor(&pid_ns->idr) - 1;
> +     next = -1;
> +#ifdef CONFIG_IA32_EMULATION
> +     if (!pid_ns->pid_noncyclic)
> +#endif
> +             next += idr_get_cursor(&pid_ns->idr);
>  
>       tmp.data = &next;
>       ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> -     if (!ret && write)
> -             idr_set_cursor(&pid_ns->idr, next + 1);
> +     if (!ret && write) {
> +             if (next > -1)
> +                     idr_set_cursor(&pid_ns->idr, next + 1);
> +             else if (!IS_ENABLED(CONFIG_IA32_EMULATION))
> +                     ret = -EINVAL;
> +#ifdef CONFIG_IA32_EMULATION
> +             WRITE_ONCE(pid_ns->pid_noncyclic, next == -1);
> +#endif
> +     }
>  
>       return ret;
>  }
> @@ -288,11 +302,11 @@ static const struct ctl_table pid_ns_ctl_table[] = {
>               .maxlen = sizeof(int),
>               .mode = 0666, /* permissions are checked in the handler */
>               .proc_handler = pid_ns_ctl_handler,
> -             .extra1 = SYSCTL_ZERO,
> +             .extra1 = SYSCTL_NEG_ONE,
>               .extra2 = &pid_max,
>       },
>  };
> -#endif       /* CONFIG_CHECKPOINT_RESTORE */
> +#endif       /* CONFIG_CHECKPOINT_RESTORE || CONFIG_IA32_EMULATION */
>  
>  int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
>  {
> @@ -449,7 +463,7 @@ static __init int pid_namespaces_init(void)
>  {
>       pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT);
>  
> -#ifdef CONFIG_CHECKPOINT_RESTORE
> +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(CONFIG_IA32_EMULATION)
>       register_sysctl_init("kernel", pid_ns_ctl_table);
>  #endif
>  
> -- 
> 2.48.1
> 

Reply via email to