I think this approach will lead to madness. For one, it will make fstat randomly fail, very rarely. So if someone in userland scripts on top of it, their scripts will extremely rarely fail on them. That is horrific.
Looks like aggressive micro-locking was introduced without considering the consequences. On a average machine this was O(n) with up to approximately n=1000, and it was under a fat lock, but each operation was mostly copying some structure fields to other structure fields in a different pre-allocated array. And now you say, every step has a potential sleep. In the zeal to unlock a fairly small O(linear), every step got potentially huge. That is a big step backwards. Alexander Bluhm <[email protected]> wrote: > On Thu, Nov 26, 2020 at 04:51:23PM +0100, Marcus MERIGHI wrote: > > Starting stack trace... > > panic(ffffffff81de557b) at panic+0x11d > > kerntrap(ffff8000229c1630) at kerntrap+0x114 > > alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b > > fill_file(ffff800000ca8000,fffffd811c6a23d0,fffffd81012516f8,3,0,ffff800022735658) > > at fil6 > > sysctl_file(ffff8000229c1c58,4,b4769963000,ffff8000229c1c88,ffff8000227d8c98) > > at sysctl_f2 > > kern_sysctl(ffff8000229c1c54,5,b4769963000,ffff8000229c1c88,0,0) at > > kern_sysctl+0x1d1 > > sys_sysctl(ffff8000227d8c98,ffff8000229c1cf0,ffff8000229c1d50) at > > sys_sysctl+0x184 > > syscall(ffff8000229c1dc0) at syscall+0x389 > > Xsyscall() at Xsyscall+0x128 > > end of kernel > > This trace looks like a bug in the fstat sysctl. I guess that > NET_LOCK() sleeps and the allprocess ps_list is not protected. > > Does calling fstat in a loop trigger the same bug? > > When I tried to grab the netlock outside the process loop some years > ago I remember some other locking problems. So my new idea is to > try to get the netlock, sleep, and if we did sleep, run the loop > from the beginning. > > Does this diff fix the crash? > > I am not sure if we want to go this way. There is no guarantee > that fstat will terminate on a box with a lot of network traffic. > Anyway it would be good to know whether my fix matches the problem > you see. > > bluhm > > Index: kern/kern_rwlock.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_rwlock.c,v > retrieving revision 1.45 > diff -u -p -r1.45 kern_rwlock.c > --- kern/kern_rwlock.c 2 Mar 2020 17:07:49 -0000 1.45 > +++ kern/kern_rwlock.c 30 Nov 2020 16:42:39 -0000 > @@ -39,7 +39,7 @@ void rw_do_exit(struct rwlock *, unsigne > #define RW_SPINS 1000 > > #ifdef MULTIPROCESSOR > -#define rw_cas(p, o, n) (atomic_cas_ulong(p, o, n) != o) > +#define rw_cas(p, o, n) (atomic_cas_ulong((p), (o), (n)) != (o)) > #else > static inline int > rw_cas(volatile unsigned long *p, unsigned long o, unsigned long n) > @@ -126,6 +126,26 @@ rw_enter_write(struct rwlock *rwl) > LOP_EXCLUSIVE | LOP_NEWORDER, NULL); > WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); > } > +} > + > +int > +rw_enter_write_try(struct rwlock *rwl) > +{ > + struct proc *p = curproc; > + int error = 0; > + > + if (__predict_false(rw_cas(&rwl->rwl_owner, 0, > + RW_PROC(p) | RWLOCK_WRLOCK))) { > + error = rw_enter(rwl, RW_WRITE | RW_SLEEPFAIL); > + if (error) > + rw_cas(&rwl->rwl_owner, RW_PROC(p) | RWLOCK_WRLOCK, 0); > + } else { > + membar_enter_after_atomic(); > + WITNESS_CHECKORDER(&rwl->rwl_lock_obj, > + LOP_EXCLUSIVE | LOP_NEWORDER, NULL); > + WITNESS_LOCK(&rwl->rwl_lock_obj, LOP_EXCLUSIVE); > + } > + return error; > } > > void > Index: kern/kern_sysctl.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v > retrieving revision 1.383 > diff -u -p -r1.383 kern_sysctl.c > --- kern/kern_sysctl.c 16 Nov 2020 06:42:12 -0000 1.383 > +++ kern/kern_sysctl.c 30 Nov 2020 16:44:07 -0000 > @@ -144,7 +144,7 @@ int sysctl_audio(int *, u_int, void *, s > int sysctl_cpustats(int *, u_int, void *, size_t *, void *, size_t); > int sysctl_utc_offset(void *, size_t *, void *, size_t); > > -void fill_file(struct kinfo_file *, struct file *, struct filedesc *, int, > +int fill_file(struct kinfo_file *, struct file *, struct filedesc *, int, > struct vnode *, struct process *, struct proc *, struct socket *, int); > void fill_kproc(struct process *, struct kinfo_proc *, struct proc *, int); > > @@ -1079,7 +1079,7 @@ sysctl_rdstruct(void *oldp, size_t *oldl > } > > #ifndef SMALL_KERNEL > -void > +int > fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp, > int fd, struct vnode *vp, struct process *pr, struct proc *p, > struct socket *so, int show_pointers) > @@ -1160,7 +1160,7 @@ fill_file(struct kinfo_file *kf, struct > break; > > case DTYPE_SOCKET: { > - int locked = 0; > + int error, locked = 0; > > if (so == NULL) { > so = (struct socket *)fp->f_data; > @@ -1168,7 +1168,9 @@ fill_file(struct kinfo_file *kf, struct > switch (so->so_proto->pr_domain->dom_family) { > case AF_INET: > case AF_INET6: > - NET_LOCK(); > + NET_LOCK_TRY(error); > + if (error) > + return error; > locked = 1; > break; > } > @@ -1304,6 +1306,7 @@ fill_file(struct kinfo_file *kf, struct > kf->fd_ofileflags = fdp->fd_ofileflags[fd]; > fdpunlock(fdp); > } > + return 0; > } > > /* > @@ -1344,7 +1347,10 @@ sysctl_file(int *name, u_int namelen, ch > > #define FILLIT2(fp, fdp, i, vp, pr, so) do { \ > if (buflen >= elem_size && elem_count > 0) { \ > - fill_file(kf, fp, fdp, i, vp, pr, p, so, show_pointers);\ > + error = fill_file(kf, fp, fdp, i, vp, pr, p, so, \ > + show_pointers); \ > + if (error) \ > + break; \ > error = copyout(kf, dp, outsize); \ > if (error) \ > break; \ > @@ -1400,6 +1406,7 @@ sysctl_file(int *name, u_int namelen, ch > error = EINVAL; > break; > } > + retry_bypid: > matched = 0; > LIST_FOREACH(pr, &allprocess, ps_list) { > /* > @@ -1427,12 +1434,17 @@ sysctl_file(int *name, u_int namelen, ch > continue; > FILLIT(fp, fdp, i, NULL, pr); > FRELE(fp, p); > + if (error == EAGAIN) { > + error = 0; > + goto retry_bypid; > + } > } > } > if (!matched) > error = ESRCH; > break; > case KERN_FILE_BYUID: > + retry_byuid: > LIST_FOREACH(pr, &allprocess, ps_list) { > /* > * skip system, exiting, embryonic and undead > @@ -1456,6 +1468,10 @@ sysctl_file(int *name, u_int namelen, ch > continue; > FILLIT(fp, fdp, i, NULL, pr); > FRELE(fp, p); > + if (error == EAGAIN) { > + error = 0; > + goto retry_byuid; > + } > } > } > break; > Index: sys/rwlock.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/rwlock.h,v > retrieving revision 1.26 > diff -u -p -r1.26 rwlock.h > --- sys/rwlock.h 16 Jul 2019 01:40:49 -0000 1.26 > +++ sys/rwlock.h 30 Nov 2020 16:43:39 -0000 > @@ -151,6 +151,7 @@ void rw_enter_read(struct rwlock *); > void rw_enter_write(struct rwlock *); > void rw_exit_read(struct rwlock *); > void rw_exit_write(struct rwlock *); > +int rw_enter_write_try(struct rwlock *); > > #ifdef DIAGNOSTIC > void rw_assert_wrlock(struct rwlock *); > Index: sys/systm.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/systm.h,v > retrieving revision 1.148 > diff -u -p -r1.148 systm.h > --- sys/systm.h 26 Aug 2020 03:29:07 -0000 1.148 > +++ sys/systm.h 30 Nov 2020 16:42:17 -0000 > @@ -333,6 +333,7 @@ extern struct rwlock netlock; > */ > #define NET_LOCK() do { rw_enter_write(&netlock); } while (0) > #define NET_UNLOCK() do { rw_exit_write(&netlock); } while (0) > +#define NET_LOCK_TRY(e) do { (e) = rw_enter_write_try(&netlock); } > while (0) > > /* > * Reader version of NET_LOCK() to be used in "softnet" thread only. >
