On 3/13/26 12:09, Pavel Tikhomirov wrote:
> First testcase "pidns_init_via_setns" checks that a process can become
> Pid 1 (init) in a new Pid namespace created via unshare() and joined via
> setns().
>
> Second testcase "pidns_init_via_setns_set_tid" checks that during this
> process we can use clone3() + set_tid and set the pid in both the new
> and old pid namespaces (owned by different user namespaces).
>
> Signed-off-by: Pavel Tikhomirov <[email protected]>
> ---
> .../selftests/pid_namespace/.gitignore | 1 +
> .../testing/selftests/pid_namespace/Makefile | 2 +-
> .../pid_namespace/pidns_init_via_setns.c | 238 ++++++++++++++++++
> 3 files changed, 240 insertions(+), 1 deletion(-)
> create mode 100644
> tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
>
> diff --git a/tools/testing/selftests/pid_namespace/.gitignore
> b/tools/testing/selftests/pid_namespace/.gitignore
> index 5118f0f3edf4..c647c6eb3367 100644
> --- a/tools/testing/selftests/pid_namespace/.gitignore
> +++ b/tools/testing/selftests/pid_namespace/.gitignore
> @@ -1,2 +1,3 @@
> pid_max
> +pidns_init_via_setns
> regression_enomem
> diff --git a/tools/testing/selftests/pid_namespace/Makefile
> b/tools/testing/selftests/pid_namespace/Makefile
> index b972f55d07ae..b01a924ac04b 100644
> --- a/tools/testing/selftests/pid_namespace/Makefile
> +++ b/tools/testing/selftests/pid_namespace/Makefile
> @@ -1,7 +1,7 @@
> # SPDX-License-Identifier: GPL-2.0
> CFLAGS += -g $(KHDR_INCLUDES)
>
> -TEST_GEN_PROGS = regression_enomem pid_max
> +TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns
>
> LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h
>
> diff --git a/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
> b/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
> new file mode 100644
> index 000000000000..7e4c610291d3
> --- /dev/null
> +++ b/tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
> @@ -0,0 +1,238 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <sched.h>
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +
> +#include "kselftest_harness.h"
> +#include "../pidfd/pidfd.h"
> +
> +/*
> + * Test that a process can become PID 1 (init) in a new PID namespace
> + * created via unshare() and joined via setns().
> + *
> + * Flow:
> + * 1. Parent creates a pipe for synchronization.
> + * 2. Parent forks a child.
> + * 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace.
> + * 4. Parent signals the child via the pipe.
> + * 5. Child opens parent's /proc/<ppid>/ns/pid_for_children and calls
> + * setns(fd, CLONE_NEWPID) to join the new namespace.
> + * 6. Child forks a grandchild.
> + * 7. Grandchild verifies getpid() == 1.
> + */
> +TEST(pidns_init_via_setns)
> +{
> + pid_t child, parent_pid;
> + int pipe_fd[2];
> + char buf;
> +
> + parent_pid = getpid();
> +
> + ASSERT_EQ(0, pipe(pipe_fd));
> +
> + child = fork();
> + ASSERT_GE(child, 0);
> +
> + if (child == 0) {
> + char path[256];
> + int nsfd;
> + pid_t grandchild;
> +
> + close(pipe_fd[1]);
> +
> + /* Wait for parent to complete unshare */
> + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
> + close(pipe_fd[0]);
> +
> + snprintf(path, sizeof(path),
> + "/proc/%d/ns/pid_for_children", parent_pid);
> + nsfd = open(path, O_RDONLY);
> + ASSERT_GE(nsfd, 0);
> +
> + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
> + close(nsfd);
> +
> + grandchild = fork();
> + ASSERT_GE(grandchild, 0);
> +
> + if (grandchild == 0) {
> + /* Should be init (PID 1) in the new namespace */
> + if (getpid() != 1)
> + _exit(1);
> + _exit(0);
> + }
> +
> + ASSERT_EQ(0, wait_for_pid(grandchild));
> + _exit(0);
> + }
> +
> + close(pipe_fd[0]);
> +
> + if (geteuid())
> + ASSERT_EQ(0, unshare(CLONE_NEWUSER));
I think it's better to move this hack above to the very top of the test.
Was found by
https://sashiko.dev/#/patchset/20260313111014.2068913-1-ptikhomirov%40virtuozzo.com
> +
> + ASSERT_EQ(0, unshare(CLONE_NEWPID));
> +
> + /* Signal child that the new PID namespace is ready */
> + buf = 0;
> + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
> + close(pipe_fd[1]);
> +
> + ASSERT_EQ(0, wait_for_pid(child));
> +}
> +
> +/*
> + * Similar to pidns_init_via_setns, but:
> + * 1. Parent enters a new PID namespace right from the start to be able to
> + * later freely use pid 1001 in it.
> + * 2. After forking child, parent also calls unshare(CLONE_NEWUSER)
> + * before unshare(CLONE_NEWPID) so that new old and new pid namespaces
> have
> + * different user namespace owners.
> + * 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and
> + * grandchild checks that it gets desired pids .
> + *
> + * Flow:
> + * 1. Test process creates a new PID namespace and forks a wrapper
> + * (PID 1 in the outer namespace).
> + * 2. Wrapper forks a child.
> + * 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID)
> + * to create an inner PID namespace.
> + * 4. Wrapper signals the child via pipe.
> + * 5. Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls
> + * setns(fd, CLONE_NEWPID) to join the inner namespace.
> + * 6. Child calls clone3() with set_tid={1, 1001}.
> + * 7. Grandchild verifies its NSpid ends with "1001 1".
> + */
> +
> +pid_t set_tid[] = {1, 1001};
> +
> +static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata
> *_metadata)
> +{
> + char *line = NULL;
> + size_t len = 0;
> + int found = 0;
> + FILE *gf;
> +
> + gf = fopen("/proc/self/status", "r");
> + ASSERT_NE(gf, NULL);
> +
> + while (getline(&line, &len, gf) != -1) {
> + if (strncmp(line, "NSpid:", 6) != 0)
> + continue;
> +
> + for (int i = 0; i < 2; i++) {
> + char *last = strrchr(line, '\t');
> + pid_t pid;
> +
> + ASSERT_NE(last, NULL);
> + ASSERT_EQ(sscanf(last, "%d", &pid), 1);
> + ASSERT_EQ(pid, set_tid[i]);
> + *last = '\0';
> + }
> +
> + found = true;
> + break;
> + }
> +
> + free(line);
> + fclose(gf);
> + ASSERT_TRUE(found);
> + return 0;
> +}
> +
> +static int pidns_init_via_setns_set_tid_child(struct __test_metadata
> *_metadata,
> + pid_t parent_pid, int pipe_fd[2])
> +{
> + struct __clone_args args = {
> + .exit_signal = SIGCHLD,
> + .set_tid = ptr_to_u64(set_tid),
> + .set_tid_size = 2,
> + };
> + pid_t grandchild;
> + char path[256];
> + char buf;
> + int nsfd;
> +
> + close(pipe_fd[1]);
> +
> + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1));
> + close(pipe_fd[0]);
> +
> + snprintf(path, sizeof(path),
> + "/proc/%d/ns/pid_for_children", parent_pid);
> + nsfd = open(path, O_RDONLY);
> + ASSERT_GE(nsfd, 0);
> +
> + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID));
> + close(nsfd);
> +
> + grandchild = sys_clone3(&args, sizeof(args));
> + ASSERT_GE(grandchild, 0);
> +
> + if (grandchild == 0)
> + _exit(pidns_init_via_setns_set_tid_grandchild(_metadata));
> +
> + ASSERT_EQ(0, wait_for_pid(grandchild));
> + return 0;
> +}
> +
> +static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata
> *_metadata)
> +{
> + int pipe_fd[2];
> + pid_t child, parent_pid;
> + char buf;
> + FILE *f;
> +
> + /*
> + * We are PID 1 inside the new namespace, but /proc is
> + * mounted from the host. Read our host-visible PID so
> + * the child can reach our pid_for_children via /proc.
> + */
> + f = fopen("/proc/self/stat", "r");
> + ASSERT_NE(f, NULL);
> + ASSERT_EQ(fscanf(f, "%d", &parent_pid), 1);
> + ASSERT_EQ(0, pipe(pipe_fd));
> +
> + child = fork();
> + ASSERT_GE(child, 0);
> +
> + if (child == 0)
> + _exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid,
> pipe_fd));
> +
> + close(pipe_fd[0]);
> +
> + ASSERT_EQ(0, unshare(CLONE_NEWUSER));
> + ASSERT_EQ(0, unshare(CLONE_NEWPID));
> +
> + buf = 0;
> + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1));
> + close(pipe_fd[1]);
> +
> + ASSERT_EQ(0, wait_for_pid(child));
> +
> + fclose(f);
> + return 0;
> +}
> +
> +TEST(pidns_init_via_setns_set_tid)
> +{
> + pid_t wrapper;
> +
> + if (geteuid())
> + ASSERT_EQ(0, unshare(CLONE_NEWUSER));
I found that the above hack is not exactly working in this test as later
unshare(CLONE_NEWUSER) in the test will fail with EPERM. To overcome this the
setup of uid/gid maps is needed.
Maybe there is some alternative way to mark this test only run from root
instead of makeing a quite complex fallback for unprivileged case?
> +
> + ASSERT_EQ(0, unshare(CLONE_NEWPID));
> +
> + wrapper = fork();
> + ASSERT_GE(wrapper, 0);
> +
> + if (wrapper == 0)
> + _exit(pidns_init_via_setns_set_tid_wrapper(_metadata));
> +
> + ASSERT_EQ(0, wait_for_pid(wrapper));
> +}
> +
> +TEST_HARNESS_MAIN
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.