On Fri, Oct 12, 2018 at 05:33:35PM -0700, Enke Chen wrote:
> For simplicity and consistency, this patch provides an implementation
> for signal-based fault notification prior to the coredump of a child
> process. A new prctl command, PR_SET_PREDUMP_SIG, is defined that can
> be used by an application to express its interest and to specify the
> signal (SIGCHLD or SIGUSR1 or SIGUSR2) for such a notification. A new
> signal code (si_code), CLD_PREDUMP, is also defined for SIGCHLD.
> 
> Background:
> 
> As the coredump of a process may take time, in certain time-sensitive
> applications it is necessary for a parent process (e.g., a process
> manager) to be notified of a child's imminent death before the coredump
> so that the parent process can act sooner, such as re-spawning an
> application process, or initiating a control-plane fail-over.
> 
> Currently there are two ways for a parent process to be notified of a
> child process's state change. One is to use the POSIX signal, and
> another is to use the kernel connector module. The specific events and
> actions are summarized as follows:
> 
> Process Event    POSIX Signal                Connector-based
> ----------------------------------------------------------------------
> ptrace_attach()  do_notify_parent_cldstop()  proc_ptrace_connector()
>                  SIGCHLD / CLD_STOPPED
> 
> ptrace_detach()  do_notify_parent_cldstop()  proc_ptrace_connector()
>                  SIGCHLD / CLD_CONTINUED
> 
> pre_coredump/    N/A                         proc_coredump_connector()
> get_signal()
> 
> post_coredump/   do_notify_parent()          proc_exit_connector()
> do_exit()        SIGCHLD / exit_signal
> ----------------------------------------------------------------------
> 
> As shown in the table, the signal-based pre-coredump notification is not
> currently available. In some cases using a connector-based notification
> can be quite complicated (e.g., when a process manager is written in shell
> scripts and thus is subject to certain inherent limitations), and a
> signal-based notification would be simpler and better suited.
> 
> Signed-off-by: Enke Chen <enkec...@cisco.com>
> ---
>  arch/x86/kernel/signal_compat.c    |  2 +-
>  include/linux/sched.h              |  4 ++
>  include/linux/signal.h             |  5 +++
>  include/uapi/asm-generic/siginfo.h |  3 +-
>  include/uapi/linux/prctl.h         |  4 ++
>  kernel/fork.c                      |  1 +
>  kernel/signal.c                    | 51 +++++++++++++++++++++++++
>  kernel/sys.c                       | 77 ++++++++++++++++++++++++++++++++++++++
>  8 files changed, 145 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
> index 9ccbf05..a3deba8 100644
> --- a/arch/x86/kernel/signal_compat.c
> +++ b/arch/x86/kernel/signal_compat.c
> @@ -30,7 +30,7 @@ static inline void signal_compat_build_tests(void)
>       BUILD_BUG_ON(NSIGSEGV != 7);
>       BUILD_BUG_ON(NSIGBUS  != 5);
>       BUILD_BUG_ON(NSIGTRAP != 5);
> -     BUILD_BUG_ON(NSIGCHLD != 6);
> +     BUILD_BUG_ON(NSIGCHLD != 7);
>       BUILD_BUG_ON(NSIGSYS  != 1);
>  
>       /* This is part of the ABI and can never change in size: */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 09026ea..cfb9645 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -696,6 +696,10 @@ struct task_struct {
>       int                             exit_signal;
>       /* The signal sent when the parent dies: */
>       int                             pdeath_signal;
> +
> +     /* The signal sent prior to a child's coredump: */
> +     int                             predump_signal;
> +
>       /* JOBCTL_*, siglock protected: */
>       unsigned long                   jobctl;
>  
> diff --git a/include/linux/signal.h b/include/linux/signal.h
> index 706a499..7cb976d 100644
> --- a/include/linux/signal.h
> +++ b/include/linux/signal.h
> @@ -256,6 +256,11 @@ static inline int valid_signal(unsigned long sig)
>       return sig <= _NSIG ? 1 : 0;
>  }
>  
> +static inline int valid_predump_signal(int sig)
> +{
> +     return (sig == SIGCHLD) || (sig == SIGUSR1) || (sig == SIGUSR2);
> +}
> +
>  struct timespec;
>  struct pt_regs;
>  enum pid_type;
> diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
> index cb3d6c2..1a47cef 100644
> --- a/include/uapi/asm-generic/siginfo.h
> +++ b/include/uapi/asm-generic/siginfo.h
> @@ -267,7 +267,8 @@ struct {                          \
>  #define CLD_TRAPPED  4       /* traced child has trapped */
>  #define CLD_STOPPED  5       /* child has stopped */
>  #define CLD_CONTINUED        6       /* stopped child has continued */
> -#define NSIGCHLD     6
> +#define CLD_PREDUMP  7       /* child is about to dump core */
> +#define NSIGCHLD     7
>  
>  /*
>   * SIGPOLL (or any other signal without signal specific si_codes) si_codes
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index c0d7ea0..79f0a8a 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -219,4 +219,8 @@ struct prctl_mm_map {
>  # define PR_SPEC_DISABLE             (1UL << 2)
>  # define PR_SPEC_FORCE_DISABLE               (1UL << 3)
>  
> +/* Whether to receive signal prior to child's coredump */
> +#define PR_SET_PREDUMP_SIG   54
> +#define PR_GET_PREDUMP_SIG   55
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 07cddff..c296c11 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1985,6 +1985,7 @@ static __latent_entropy struct task_struct *copy_process(
>       p->dirty_paused_when = 0;
>  
>       p->pdeath_signal = 0;
> +     p->predump_signal = 0;
>       INIT_LIST_HEAD(&p->thread_group);
>       p->task_works = NULL;
>  
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 312b43e..eb4a483 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2337,6 +2337,44 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
>       return signr;
>  }
>  
> +/*
> + * Let the parent, if so desired, know about the imminent death of a child
> + * prior to its coredump.
> + *
> + * Locking logic is similar to do_notify_parent_cldstop().
> + */
> +static void do_notify_parent_predump(struct task_struct *tsk)
> +{
> +     struct sighand_struct *sighand;
> +     struct task_struct *parent;
> +     struct kernel_siginfo info;
> +     unsigned long flags;
> +     int sig;
> +
> +     parent = tsk->real_parent;
> +     sig = parent->predump_signal;
> +
> +     /* Check again with "tasklist_lock" locked by the caller */
> +     if (!valid_predump_signal(sig))
> +             return;
> +
> +     clear_siginfo(&info);
> +     info.si_signo = sig;
> +     if (sig == SIGCHLD)
> +             info.si_code = CLD_PREDUMP;
> +
> +     rcu_read_lock();
> +     info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
> +     info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns),
> +                                    task_uid(tsk));
> +     rcu_read_unlock();
> +
> +     sighand = parent->sighand;
> +     spin_lock_irqsave(&sighand->siglock, flags);
> +     __group_send_sig_info(sig, &info, parent);
> +     spin_unlock_irqrestore(&sighand->siglock, flags);
> +}
> +
>  bool get_signal(struct ksignal *ksig)
>  {
>       struct sighand_struct *sighand = current->sighand;
> @@ -2497,6 +2535,19 @@ bool get_signal(struct ksignal *ksig)
>               current->flags |= PF_SIGNALED;
>  
>               if (sig_kernel_coredump(signr)) {
> +                     /*
> +                      * Notify the parent prior to the coredump if the
>                       * parent is interested in such a notification.
> +                      */
> +                     int p_sig = current->real_parent->predump_signal;
> +
> +                     if (valid_predump_signal(p_sig)) {
> +                             read_lock(&tasklist_lock);
> +                             do_notify_parent_predump(current);
> +                             read_unlock(&tasklist_lock);
> +                             cond_resched();
> +                     }
> +
>                       if (print_fatal_signals)
>                               print_fatal_signal(ksig->info.si_signo);
>                       proc_coredump_connector(current);
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 123bd73..43eb250d 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2258,6 +2258,76 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
>       return -EINVAL;
>  }
>  
> +static int prctl_get_predump_signal(struct task_struct *tsk, pid_t pid,
> +                                 int __user *addr)
> +{
> +     struct task_struct *p;
> +     int error;
> +
> +     /* For the current task, the common case */
> +     if (pid == 0) {
> +             put_user(tsk->predump_signal, addr);
> +             return 0;
> +     }
> +
> +     error = -ESRCH;
> +     rcu_read_lock();
> +     p = find_task_by_vpid(pid);
> +     if (p) {
> +             error = 0;
> +             put_user(p->predump_signal, addr);
> +     }
> +     rcu_read_unlock();
> +     return error;
> +}
> +
> +/*
> + * Returns true if current's euid is same as p's uid or euid,
> + * or has CAP_SYS_ADMIN.
> + *
> + * Called with rcu_read_lock, creds are safe.
> + *
> + * Adapted from set_one_prio_perm().
> + */
> +static bool set_predump_signal_perm(struct task_struct *p)
> +{
> +     const struct cred *cred = current_cred(), *pcred = __task_cred(p);
> +
> +     return uid_eq(pcred->uid, cred->euid) ||
> +            uid_eq(pcred->euid, cred->euid) ||
> +            capable(CAP_SYS_ADMIN);

So before proceeding I'd like to discuss at least two points:
- How does this interact with the dumpability of a process?
- Does the capable(CAP_SYS_ADMIN) check need to be restricted to
  init_user_ns? It seems we could make this work per-user-ns, just like
  PR_SET_PDEATHSIG does?

> +}
> +
> +static int prctl_set_predump_signal(struct task_struct *tsk, pid_t pid, int sig)
> +{
> +     struct task_struct *p;
> +     int error;
> +
> +     /* 0 is valid for disabling the feature */
> +     if (sig && !valid_predump_signal(sig))
> +             return -EINVAL;
> +
> +     /* For the current task, the common case */
> +     if (pid == 0) {
> +             tsk->predump_signal = sig;
> +             return 0;
> +     }
> +
> +     error = -ESRCH;
> +     rcu_read_lock();
> +     p = find_task_by_vpid(pid);
> +     if (p) {
> +             if (!set_predump_signal_perm(p))
> +                     error = -EPERM;
> +             else {
> +                     error = 0;
> +                     p->predump_signal = sig;
> +             }
> +     }
> +     rcu_read_unlock();
> +     return error;
> +}
> +
>  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>               unsigned long, arg4, unsigned long, arg5)
>  {
> @@ -2476,6 +2546,13 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
>                       return -EINVAL;
>               error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>               break;
> +     case PR_SET_PREDUMP_SIG:
> +             error = prctl_set_predump_signal(me, (pid_t)arg2, (int)arg3);
> +             break;
> +     case PR_GET_PREDUMP_SIG:
> +             error = prctl_get_predump_signal(me, (pid_t)arg2,
> +                                              (int __user *)arg3);
> +             break;
>       default:
>               error = -EINVAL;
>               break;
> -- 
> 1.8.3.1

Reply via email to