Hello,
On Sat, Oct 19, 2019 at 10:30:07AM +0200, Anton Lindqvist wrote:
> On Wed, Jul 31, 2019 at 06:11:33PM +0100, Stuart Henderson wrote:
> > July 29 amd64 snap. I had just tested something in a vm (not very
> > common for me) and did "halt -p" in the guest. Immediately afterwards
> > I hit this:
> >
> > uvm_fault(0xfffffd84031d4ef0, 0x8, 0, 1) -> e
> > kernel: page fault trap, code=0
> > Stopped at filt_bpfrdetach+0x43: movq 0x8(%rax),%rax
>
> I've been looking into a similar bpf panic reported by syzkaller[1],
> which looks somewhat related. The one reported by syzkaller is caused
> by issuing ioctl(SIOCIFDESTROY) on the interface which the packet filter
> is attached to. This will in turn invoke the following functions
> expressed as an inverted stacktrace:
>
> 1. bpfsdetach()
> 2. vdevgone()
> 3. VOP_REVOKE()
> 4. vop_generic_revoke()
> 5. vgonel()
> 6. vclean(DOCLOSE)
> 7. VOP_CLOSE()
> 8. bpfclose()
>
> Note that bpfclose() is called before changing the vnode type. In
> bpfclose(), the `struct bpf_d` is immediately removed from the global
> bpf_d_list list and might end up sleeping inside taskq_barrier(systq).
> Since the bpf file descriptor (fd) is still present and valid, another
> thread could perform an ioctl() on the fd only to fault since
> bpfilter_lookup() will return NULL. The vnode is not locked in this path
> either so it won't end up waiting on the ongoing vclean().
>
> Even if the `struct bpf_d' was removed from the bpf_d_list after calling
> taskq_barrier() I assume it would need to be flagged as dying or
> something similar. In order to prevent other threads from operating on
> it. After calling bpfilter_lookup(), one would need to check the current
> state and punt if dying:
>
> d = bpfilter_lookup(minor);
> if (d->bd_state == BPFD_STATE_DYING)
> return (ENXIO);
>
> I assume another potential problem would be if the same
> ioctl(SIOCIFDESTROY) path is executed while another thread is sleeping
> in bpfioctl(), the `bpfilter_lookup() != NULL' condition would need to
> checked after sleeping. Alternatively, reference counting would help
> here but the bd_state would need to be inspected after sleeping.
>
> Thoughts?
>
> [1]
> https://syzkaller.appspot.com/bug?id=0fb0ea958b706f72818cc47b182e3ac1dc5d355a
</snip>
Thanks for detailed explanation. I'd like to make sure I got the story
right.
Does it mean the syzkaller is multithreaded application, which both thread
share the same file descriptor? Like thread 1 calls close(bpf_fd),
while thread 2 does ioctl(bpf_fd, ...)? If this is true, then the story
does make sense. Looks like VFS can protect us if there are two competing
processes, but not threads. This is my understanding, not sure if it is
correct.
Another option would be to let bpfioctl() fail when no bpf descriptor is
associated with file.
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 48def8a787e..184e57c1969 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -669,6 +669,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag,
struct proc *p)
int error = 0;
d = bpfilter_lookup(minor(dev));
+ if (d == NULL)
+ return (ENXIO);
+
if (d->bd_locked && suser(p) != 0) {
/* list of allowed ioctls when locked and not root */
switch (cmd) {
--------8<---------------8<---------------8<------------------8<--------
We can also backout my commit below [1]:
$OpenBSD: bpf.c,v 1.175 2019/05/18 12:59:32 sashan Exp $
Diff below puts reference counting for bpf descriptors back. So we will be
removing bpf descriptor from the list, when last reference dies.
thanks and
regards
sashan
[1]
https://github.com/openbsd/src/commit/b50d0c1cf040666aed872208cd6f6ba609197b11#diff-7922ad1d2f6422aa72d4bacd1bf41909
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/bpf.c b/sys/net/bpf.c
index 48def8a787e..e1f50af09eb 100644
--- a/sys/net/bpf.c
+++ b/sys/net/bpf.c
@@ -124,6 +124,13 @@ void bpf_resetd(struct bpf_d *);
void bpf_prog_smr(void *);
void bpf_d_smr(void *);
+/*
+ * Reference count access to descriptor buffers
+ */
+void bpf_get(struct bpf_d *);
+void bpf_put(struct bpf_d *);
+
+
struct rwlock bpf_sysctl_lk = RWLOCK_INITIALIZER("bpfsz");
int
@@ -318,11 +325,13 @@ bpf_detachd(struct bpf_d *d)
d->bd_promisc = 0;
+ bpf_get(d);
mtx_leave(&d->bd_mtx);
NET_LOCK();
error = ifpromisc(bp->bif_ifp, 0);
NET_UNLOCK();
mtx_enter(&d->bd_mtx);
+ bpf_put(d);
if (error && !(error == EINVAL || error == ENODEV ||
error == ENXIO))
@@ -371,6 +380,7 @@ bpfopen(dev_t dev, int flag, int mode, struct proc *p)
if (flag & FNONBLOCK)
bd->bd_rtout = -1;
+ bpf_get(bd);
LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);
return (0);
@@ -391,13 +401,7 @@ bpfclose(dev_t dev, int flag, int mode, struct proc *p)
bpf_wakeup(d);
LIST_REMOVE(d, bd_list);
mtx_leave(&d->bd_mtx);
-
- /*
- * Wait for the task to finish here, before proceeding to garbage
- * collection.
- */
- taskq_barrier(systq);
- smr_call(&d->bd_smr, bpf_d_smr, d);
+ bpf_put(d);
return (0);
}
@@ -431,6 +435,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
if (d->bd_bif == NULL)
return (ENXIO);
+ bpf_get(d);
mtx_enter(&d->bd_mtx);
/*
@@ -536,6 +541,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
d->bd_in_uiomove = 0;
out:
mtx_leave(&d->bd_mtx);
+ bpf_put(d);
return (error);
}
@@ -554,7 +560,9 @@ bpf_wakeup(struct bpf_d *d)
* by the KERNEL_LOCK() we have to delay the wakeup to
* another context to keep the hot path KERNEL_LOCK()-free.
*/
- task_add(systq, &d->bd_wake_task);
+ bpf_get(d);
+ if (!task_add(systq, &d->bd_wake_task))
+ bpf_put(d);
}
void
@@ -569,6 +577,7 @@ bpf_wakeup_cb(void *xd)
csignal(d->bd_pgid, d->bd_sig, d->bd_siguid, d->bd_sigeuid);
selwakeup(&d->bd_sel);
+ bpf_put(d);
}
int
@@ -586,6 +595,7 @@ bpfwrite(dev_t dev, struct uio *uio, int ioflag)
if (d->bd_bif == NULL)
return (ENXIO);
+ bpf_get(d);
ifp = d->bd_bif->bif_ifp;
if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
@@ -619,6 +629,7 @@ bpfwrite(dev_t dev, struct uio *uio, int ioflag)
NET_UNLOCK();
out:
+ bpf_put(d);
return (error);
}
@@ -694,6 +705,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag,
struct proc *p)
}
}
+ bpf_get(d);
+
switch (cmd) {
default:
error = EINVAL;
@@ -991,6 +1004,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag,
struct proc *p)
break;
}
+ bpf_put(d);
return (error);
}
@@ -1178,6 +1192,7 @@ bpfkqfilter(dev_t dev, struct knote *kn)
return (EINVAL);
}
+ bpf_get(d);
kn->kn_hook = d;
SLIST_INSERT_HEAD(klist, kn, kn_selnext);
@@ -1197,6 +1212,7 @@ filt_bpfrdetach(struct knote *kn)
KERNEL_ASSERT_LOCKED();
SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
+ bpf_put(d);
}
int
@@ -1579,6 +1595,25 @@ bpf_d_smr(void *smr)
free(bd, M_DEVBUF, sizeof(*bd));
}
+void
+bpf_get(struct bpf_d *bd)
+{
+ atomic_inc_int(&bd->bd_ref);
+}
+
+/*
+ * Free buffers currently in use by a descriptor
+ * when the reference count drops to zero.
+ */
+void
+bpf_put(struct bpf_d *bd)
+{
+ if (atomic_dec_int_nv(&bd->bd_ref) > 0)
+ return;
+
+ smr_call(&bd->bd_smr, bpf_d_smr, bd);
+}
+
void *
bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
{
diff --git a/sys/net/bpfdesc.h b/sys/net/bpfdesc.h
index 130b91c1d9f..13f1ec64b05 100644
--- a/sys/net/bpfdesc.h
+++ b/sys/net/bpfdesc.h
@@ -93,6 +93,7 @@ struct bpf_d {
pid_t bd_pgid; /* process or group id for signal */
uid_t bd_siguid; /* uid for process that set pgid */
uid_t bd_sigeuid; /* euid for process that set pgid */
+ u_int bd_ref; /* reference count */
struct selinfo bd_sel; /* bsd select info */
int bd_unit; /* logical unit number */
LIST_ENTRY(bpf_d) bd_list; /* descriptor list */