On Fri, Mar 02, 2018 at 02:18:46PM +0100, Hiltjo Posthuma wrote:
> On Wed, Feb 21, 2018 at 02:15:07PM +0100, Martin Pieuchot wrote:
> > Diff below fixes a race in signal handling that can be triggered when
> > a multi-threaded process installs a signal handler with SA_RESETHAND.
> >
> > I described a case of this problem on bugs@ a month ago:
> > https://marc.info/?l=openbsd-bugs&m=151672599508049&w=2
> >
> > The race occurs when one of the threads is doing a NOLOCK syscall, like
> > futex(2):
> >
> > CPU0:
> > Before returning to userland this thread will call CURSIG() without
> > KERNEL_LOCK(), issignal() will look at `ps_sigact', see that there's
> > a handler and return the signal number. After that the thread will
> > call postsig() and spin on the KERNEL_LOCK().
> >
> > CPU1:
> > In the meantime another thread, with the KERNEL_LOCK(), also looked at
> > `ps_sigact' and is already processing the signal. Since the handler
> > has been installed with SA_RESETHAND it will reset the action to
> > SIG_DFL then release the KERNEL_LOCK().
> >
> > CPU0:
> > After grabbing the KERNEL_LOCK() inside postsig(), the thread tries
> > to process an already processed signal, see that the actions is
> > SIG_DFL and call sigexit() while all other threads are SSTOP'ed.
> >
> >
> > Diff below fixes that by serializing changes to `ps_sigact' via the
> > KERNEL_LOCK(). I also introduce postsig_done(), gypped from FreeBSD,
> > to assert that the lock is held and to prepare for finer grained
> > locking.
> >
> > Tests, oks?
> >
> > Index: kern/kern_sig.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_sig.c,v
> > retrieving revision 1.215
> > diff -u -p -r1.215 kern_sig.c
> > --- kern/kern_sig.c 19 Feb 2018 08:59:52 -0000 1.215
> > +++ kern/kern_sig.c 21 Feb 2018 13:09:54 -0000
> > @@ -82,6 +82,7 @@ void proc_stop(struct proc *p, int);
> > void proc_stop_sweep(void *);
> > struct timeout proc_stop_to;
> >
> > +void postsig(struct proc *, int);
> > int cansignal(struct proc *, struct process *, int);
> >
> > struct pool sigacts_pool; /* memory pool for sigacts structures */
> > @@ -747,6 +748,27 @@ pgsignal(struct pgrp *pgrp, int signum,
> > }
> >
> > /*
> > + * Recalculate the signal mask and reset the signal disposition after
> > + * usermode frame for delivery is formed.
> > + */
> > +void
> > +postsig_done(struct proc *p, int signum, struct sigacts *ps)
> > +{
> > + int mask = sigmask(signum);
> > +
> > + KERNEL_ASSERT_LOCKED();
> > +
> > + p->p_ru.ru_nsignals++;
> > + atomic_setbits_int(&p->p_sigmask, ps->ps_catchmask[signum]);
> > + if ((ps->ps_sigreset & mask) != 0) {
> > + ps->ps_sigcatch &= ~mask;
> > + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
> > + ps->ps_sigignore |= mask;
> > + ps->ps_sigact[signum] = SIG_DFL;
> > + }
> > +}
> > +
> > +/*
> > * Send a signal caused by a trap to the current thread
> > * If it will be caught immediately, deliver it with correct code.
> > * Otherwise, post it normally.
> > @@ -780,16 +802,9 @@ trapsignal(struct proc *p, int signum, u
> > p->p_sigmask, code, &si);
> > }
> > #endif
> > - p->p_ru.ru_nsignals++;
> > (*pr->ps_emul->e_sendsig)(ps->ps_sigact[signum], signum,
> > p->p_sigmask, trapno, code, sigval);
> > - atomic_setbits_int(&p->p_sigmask, ps->ps_catchmask[signum]);
> > - if ((ps->ps_sigreset & mask) != 0) {
> > - ps->ps_sigcatch &= ~mask;
> > - if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
> > - ps->ps_sigignore |= mask;
> > - ps->ps_sigact[signum] = SIG_DFL;
> > - }
> > + postsig_done(p, signum, ps);
> > } else {
> > p->p_sisig = signum;
> > p->p_sitrapno = trapno; /* XXX for core dump/debugger */
> > @@ -1330,9 +1345,8 @@ proc_stop_sweep(void *v)
> > * from the current set of pending signals.
> > */
> > void
> > -postsig(int signum)
> > +postsig(struct proc *p, int signum)
> > {
> > - struct proc *p = curproc;
> > struct process *pr = p->p_p;
> > struct sigacts *ps = pr->ps_sigacts;
> > sig_t action;
> > @@ -1341,12 +1355,8 @@ postsig(int signum)
> > union sigval sigval;
> > int s, code;
> >
> > -#ifdef DIAGNOSTIC
> > - if (signum == 0)
> > - panic("postsig");
> > -#endif
> > -
> > - KERNEL_LOCK();
> > + KASSERT(signum != 0);
> > + KERNEL_ASSERT_LOCKED();
> >
> > mask = sigmask(signum);
> > atomic_clearbits_int(&p->p_siglist, mask);
> > @@ -1366,7 +1376,7 @@ postsig(int signum)
> > #ifdef KTRACE
> > if (KTRPOINT(p, KTR_PSIG)) {
> > siginfo_t si;
> > -
> > +
> > initsiginfo(&si, signum, trapno, code, sigval);
> > ktrpsig(p, signum, action, p->p_flag & P_SIGSUSPEND ?
> > p->p_oldmask : p->p_sigmask, code, &si);
> > @@ -1407,15 +1417,6 @@ postsig(int signum)
> > } else {
> > returnmask = p->p_sigmask;
> > }
> > - atomic_setbits_int(&p->p_sigmask, ps->ps_catchmask[signum]);
> > - if ((ps->ps_sigreset & mask) != 0) {
> > - ps->ps_sigcatch &= ~mask;
> > - if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
> > - ps->ps_sigignore |= mask;
> > - ps->ps_sigact[signum] = SIG_DFL;
> > - }
> > - splx(s);
> > - p->p_ru.ru_nsignals++;
> > if (p->p_sisig == signum) {
> > p->p_sisig = 0;
> > p->p_sitrapno = 0;
> > @@ -1425,9 +1426,9 @@ postsig(int signum)
> >
> > (*pr->ps_emul->e_sendsig)(action, signum, returnmask, trapno,
> > code, sigval);
> > + postsig_done(p, signum, ps);
> > + splx(s);
> > }
> > -
> > - KERNEL_UNLOCK();
> > }
> >
> > /*
> > @@ -1816,7 +1817,7 @@ filt_signal(struct knote *kn, long hint)
> > void
> > userret(struct proc *p)
> > {
> > - int sig;
> > + int signum;
> >
> > /* send SIGPROF or SIGVTALRM if their timers interrupted this thread */
> > if (p->p_flag & P_PROFPEND) {
> > @@ -1832,8 +1833,12 @@ userret(struct proc *p)
> > KERNEL_UNLOCK();
> > }
> >
> > - while ((sig = CURSIG(p)) != 0)
> > - postsig(sig);
> > + if (CURSIG(p) != 0) {
> > + KERNEL_LOCK();
> > + while ((signum = CURSIG(p)) != 0)
> > + postsig(p, signum);
> > + KERNEL_UNLOCK();
> > + }
> >
> > /*
> > * If P_SIGSUSPEND is still set here, then we still need to restore
> > @@ -1845,8 +1850,10 @@ userret(struct proc *p)
> > atomic_clearbits_int(&p->p_flag, P_SIGSUSPEND);
> > p->p_sigmask = p->p_oldmask;
> >
> > - while ((sig = CURSIG(p)) != 0)
> > - postsig(sig);
> > + KERNEL_LOCK();
> > + while ((signum = CURSIG(p)) != 0)
> > + postsig(p, signum);
> > + KERNEL_UNLOCK();
> > }
> >
> > if (p->p_flag & P_SUSPSINGLE) {
> > Index: sys/signalvar.h
> > ===================================================================
> > RCS file: /cvs/src/sys/sys/signalvar.h,v
> > retrieving revision 1.28
> > diff -u -p -r1.28 signalvar.h
> > --- sys/signalvar.h 5 May 2015 02:13:46 -0000 1.28
> > +++ sys/signalvar.h 21 Feb 2018 13:04:15 -0000
> > @@ -151,7 +151,6 @@ void gsignal(int pgid, int sig);
> > void csignal(pid_t pgid, int signum, uid_t uid, uid_t euid);
> > int issignal(struct proc *p);
> > void pgsignal(struct pgrp *pgrp, int sig, int checkctty);
> > -void postsig(int sig);
> > void psignal(struct proc *p, int sig);
> > void ptsignal(struct proc *p, int sig, enum signal_type type);
> > #define prsignal(pr,sig) ptsignal((pr)->ps_mainproc, (sig), SPROCESS)
> >
>
> Hi,
>
> I tried your patch on a source build of about 25 februari on amd64. I still
> notice mpv hanging when I try to close it or when the video ends.
>
> Let me know how/if I can help you further with testing.
>
Updated to source of ~4 March:
OpenBSD ren.laptop 6.3 GENERIC.MP#0 amd64
When I do a backtrace on the hanged mpv I get:
$ gdb mpv pid
$ bt
#0 _thread_sys___thrsleep () at -:3
#1 0x0000117420965ea4 in _sem_wait (sem=0x1174219ffc40, tryonly=564132932,
abstime=0x0,
delayed_cancel=0x11738fa402f0) at /usr/src/lib/librthread/rthread_sem.c:74
#2 0x00001174209650bf in pthread_join (thread=0x1174219ffc40, retval=0x0)
at /usr/src/lib/librthread/rthread.c:304
#3 0x00001173ec0a9d5b in SDL_WaitThread_REAL () from
/usr/local/lib/libSDL2.so.0.4
#4 0x00001173ec05b64a in close_audio_device () from
/usr/local/lib/libSDL2.so.0.4
#5 0x00001173ec05a1e2 in SDL_AudioQuit_REAL () from
/usr/local/lib/libSDL2.so.0.4
#6 0x00001173ec05703b in SDL_QuitSubSystem_REAL () from
/usr/local/lib/libSDL2.so.0.4
#7 0x000011713bf135e9 in ao_uninit (ao=0x11740b004740) at ../audio/out/ao.c:336
#8 0x000011713bf561fe in uninit_audio_out (mpctx=0x1173a2d8b840) at
../player/audio.c:280
#9 0x000011713bf6efd2 in mp_destroy (mpctx=0x1173a2d8b840) at
../player/main.c:166
#10 0x000011713bf6fbe1 in mpv_main (argc=Variable "argc" is not available.
) at ../player/main.c:243
#11 0x000011713bf03156 in _start () from /usr/local/bin/mpv
#12 0x0000000000000000 in ?? ()
(gdb)
I hope this helps in some way (no patch, sorry :)),
--
Kind regards,
Hiltjo