The following program (simplified version of generated by syzkaller)

        #include <pthread.h>
        #include <unistd.h>
        #include <sys/ptrace.h>
        #include <stdio.h>
        #include <signal.h>

        void *thread_func(void *arg)
        {
                ptrace(PTRACE_TRACEME, 0,0,0);
                return 0;
        }

        int main(void)
        {
                pthread_t thread;

                if (fork())
                        return 0;

                while (getppid() != 1)
                        ;

                pthread_create(&thread, NULL, thread_func, NULL);
                pthread_join(thread, NULL);
                return 0;
        }

creates the unreapable zombie if /sbin/init doesn't use __WALL.

This is not a kernel bug, at least in a sense that everything works as
expected: debugger should reap a traced sub-thread before it can reap
the leader, but without __WALL/__WCLONE do_wait() ignores sub-threads.

Unfortunately, it seems that /sbin/init in most (all?) distributions
doesn't use it and we have to change the kernel to avoid the problem.

This patch just adds the "ptrace" check into eligible_child(). To some
degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal
is mostly ignored when the tracee reports to debugger.

This obviously means the user-visible change: __WCLONE and __WALL no
longer have any meaning for debugger. And I can only hope that this
won't break something.

We could make a more conservative change. Say, we can take __WCLONE
into account, or !thread_group_leader(). But it would be nice to not
complicate these historical/confusing checks.

Signed-off-by: Oleg Nesterov <[email protected]>
Reported-by: Dmitry Vyukov <[email protected]>
Cc: <[email protected]>
---
 kernel/exit.c |   29 ++++++++++++++++++++---------
 1 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05..819f51e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -914,17 +914,28 @@ static int eligible_pid(struct wait_opts *wo, struct 
task_struct *p)
                task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 {
        if (!eligible_pid(wo, p))
                return 0;
-       /* Wait for all children (clone and not) if __WALL is set;
-        * otherwise, wait for clone children *only* if __WCLONE is
-        * set; otherwise, wait for non-clone children *only*.  (Note:
-        * A "clone" child here is one that reports to its parent
-        * using a signal other than SIGCHLD.) */
-       if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
-           && !(wo->wo_flags & __WALL))
+
+       /*
+        * Wait for all children (clone and not) if __WALL is set or
+        * if it is traced by us.
+        */
+       if (ptrace || (wo->wo_flags & __WALL))
+               return 1;
+
+       /*
+        * Otherwise, wait for clone children *only* if __WCLONE is set;
+        * otherwise, wait for non-clone children *only*.
+        *
+        * Note: a "clone" child here is one that reports to its parent
+        * using a signal other than SIGCHLD, or a non-leader thread which
+        * we can only see if it is traced by us.
+        */
+       if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
                return 0;
 
        return 1;
@@ -1297,7 +1308,7 @@ static int wait_consider_task(struct wait_opts *wo, int 
ptrace,
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;
 
-       ret = eligible_child(wo, p);
+       ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;
 
-- 
1.5.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to