On Tue, Mar 05, 2019 at 12:03:05PM +1000, David Gwynne wrote: > this extends the fildrop mechanism so you can drop the packets with bpf > using the existing fildrop method, but with an extra tweak so you can > avoid the cost of copying packets to userland. > > i wanted to quickly drop some packets in the rx interrupt path to try > and prioritise some traffic getting processed by the system. the initial > version was going to use weird custom DLTs and extra bpf interface > pointers and stuff, but most of the glue is already in place with > the fildrop functionality. > > this also adds a bit to tcpdump so you can set a fildrop action. it > means tcpdump can be used as a quick and dirty firewall.
there's a bit more discussion about this that i should have included in my original email. firstly, the functionality it offers. this effectively offers a firewall with the ability to filter arbitrary packets. this has significant overlap with the functionality that pf offers, but there are a couple of important differences. pf only handles IP traffic, but we don't really have a good story when it comes to filtering non-IP. we could implement something like pf for the next protocol that people need to manage, but what is that next protocol? "pf-like" implies a highly optimised but constrained set of filters that deeply understands the protocol it is handling. is that next protocol ieee1905p? cdp? ipx? macsec? where should that protocol be filtered in the stack? i'm arguing that bpf with fildrop has the benefit of already existing, it's in place, and it already has the ability to be configured with arbitrary policy. considering we've got this far without handling non-IP, spending more time on it seems unjustified. secondly, the performance aspects of this diff. bpf allows for arbitrarily complicated filters, so it is entirely possible to slow your box down a lot by writing really complicated filters. this is in comparison to pf where each rule has a limit on how much work it will do, which is also mitigated by the ruleset optimiser and skip steps. i don't have a good answer to that except to say you can already add such filters to bpf, they just don't do anything except copy packets at the moment. another interesting performance consideration is that bpf runs a lot earlier than pf, so filtering packets with bpf can avoid a lot of work in the stack. if you want to pass IP statefully, pf is a much better hammer, but to drop packets up front bpf is interesting. for example, thanks to hrvoje popovski i now have a setup where i'm pushing ~7 million packets per second through a box to do performance measurements. 
those packets are udp from random ips to port 7 on another set of random ips. if i have the following rule in pf.conf: block in quick proto udp to port 7 i can rx and drop about 550kpps. if im sshed in using another interface, the system is super sluggish over that shell. if i use this diff and run the following; # tcpdump -B drop -i ix1 udp and port 7 i'm dropping about 1.2 million pps, and the box is responsive when sshed in using another interface. so, to summarise, bpf can already be used to drop packets, this is just a tweak to make it faster, and a tweak so tcpdump can be used to set up that filtering. > Index: sys/net/bpf.c > =================================================================== > RCS file: /cvs/src/sys/net/bpf.c,v > retrieving revision 1.170 > diff -u -p -r1.170 bpf.c > --- sys/net/bpf.c 13 Jul 2018 08:51:15 -0000 1.170 > +++ sys/net/bpf.c 4 Mar 2019 22:30:32 -0000 > @@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t > *(u_int *)addr = d->bd_fildrop; > break; > > - case BIOCSFILDROP: /* set "filter-drop" flag */ > - d->bd_fildrop = *(u_int *)addr ? 
1 : 0; > + case BIOCSFILDROP: { /* set "filter-drop" flag */ > + unsigned int fildrop = *(u_int *)addr; > + switch (fildrop) { > + case BPF_FILDROP_PASS: > + case BPF_FILDROP_CAPTURE: > + case BPF_FILDROP_DROP: > + d->bd_fildrop = fildrop; > + break; > + default: > + error = EINVAL; > + break; > + } > break; > + } > > case BIOCGDIRFILT: /* get direction filter */ > *(u_int *)addr = d->bd_dirfilt; > @@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf > pktlen += m0->m_len; > > SRPL_FOREACH(d, &sr, &bp->bif_dlist, bd_next) { > + struct srp_ref bsr; > + struct bpf_program *bf; > + struct bpf_insn *fcode = NULL; > + > atomic_inc_long(&d->bd_rcount); > > - if ((direction & d->bd_dirfilt) != 0) > - slen = 0; > - else { > - struct srp_ref bsr; > - struct bpf_program *bf; > - struct bpf_insn *fcode = NULL; > - > - bf = srp_enter(&bsr, &d->bd_rfilter); > - if (bf != NULL) > - fcode = bf->bf_insns; > - slen = bpf_mfilter(fcode, m, pktlen); > - srp_leave(&bsr); > - } > + if (ISSET(d->bd_dirfilt, direction)) > + continue; > + > + bf = srp_enter(&bsr, &d->bd_rfilter); > + if (bf != NULL) > + fcode = bf->bf_insns; > + slen = bpf_mfilter(fcode, m, pktlen); > + srp_leave(&bsr); > > - if (slen > 0) { > + if (slen == 0) > + continue; > + if (d->bd_fildrop != BPF_FILDROP_PASS) > + drop = 1; > + if (d->bd_fildrop != BPF_FILDROP_DROP) { > if (!gottime++) > microtime(&tv); > > @@ -1285,9 +1299,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf > bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn, > &tv); > mtx_leave(&d->bd_mtx); > - > - if (d->bd_fildrop) > - drop = 1; > } > } > SRPL_LEAVE(&sr); > Index: sys/net/bpf.h > =================================================================== > RCS file: /cvs/src/sys/net/bpf.h,v > retrieving revision 1.65 > diff -u -p -r1.65 bpf.h > --- sys/net/bpf.h 3 Feb 2018 13:37:37 -0000 1.65 > +++ sys/net/bpf.h 4 Mar 2019 22:30:32 -0000 > @@ -126,6 +126,13 @@ struct bpf_version { > #define BPF_DIRECTION_IN 1 > #define BPF_DIRECTION_OUT (1<<1) > 
> +/* > + * Values for BIOCGFILDROP/BIOCSFILDROP > + */ > +#define BPF_FILDROP_PASS 0 /* capture, pass */ > +#define BPF_FILDROP_CAPTURE 1 /* capture, drop */ > +#define BPF_FILDROP_DROP 2 /* no capture, drop */ > + > struct bpf_timeval { > u_int32_t tv_sec; > u_int32_t tv_usec; > Index: share/man/man4/bpf.4 > =================================================================== > RCS file: /cvs/src/share/man/man4/bpf.4,v > retrieving revision 1.38 > diff -u -p -r1.38 bpf.4 > --- share/man/man4/bpf.4 28 Apr 2016 19:07:19 -0000 1.38 > +++ share/man/man4/bpf.4 4 Mar 2019 22:30:32 -0000 > @@ -391,11 +391,24 @@ This flag is initialized to zero by defa > .Pp > .It Dv BIOCSFILDROP Fa "u_int *" > .It Dv BIOCGFILDROP Fa "u_int *" > -Sets or gets the status of the > +Sets or gets the > .Dq filter drop > -flag. > -If non-zero, packets matching any filters will be reported to the > -associated interface so that they can be dropped. > +action. > +The supported actions for packets matching the filter are: > +.Pp > +.Bl -tag -width "BPF_FILDROP_CAPTURE" -compact > +.It Dv BPF_FILDROP_PASS > +Accept and capture > +.It Dv BPF_FILDROP_CAPTURE > +Drop and capture > +.It Dv BPF_FILDROP_DROP > +Drop and do not capture > +.El > +.Pp > +Packets matching any filter configured to drop packets will be > +reported to the associated interface so that they can be dropped. > +The default action is > +.Dv BPF_FILDROP_PASS . 
> .Pp > .It Dv BIOCSDIRFILT Fa "u_int *" > .It Dv BIOCGDIRFILT Fa "u_int *" > Index: usr.sbin/tcpdump/privsep.c > =================================================================== > RCS file: /cvs/src/usr.sbin/tcpdump/privsep.c,v > retrieving revision 1.52 > diff -u -p -r1.52 privsep.c > --- usr.sbin/tcpdump/privsep.c 17 Nov 2018 16:52:02 -0000 1.52 > +++ usr.sbin/tcpdump/privsep.c 4 Mar 2019 22:30:32 -0000 > @@ -224,7 +224,7 @@ priv_exec(int argc, char *argv[]) > /* parse the arguments for required options */ > opterr = 0; > while ((i = getopt(argc, argv, > - "ac:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { > + "aB:c:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { > switch (i) { > case 'n': > nflag++; > @@ -366,7 +366,7 @@ static void > impl_open_bpf(int fd, int *bpfd) > { > int snaplen, promisc, err; > - u_int dlt, dirfilt; > + u_int dlt, dirfilt, fildrop; > char device[IFNAMSIZ]; > size_t iflen; > > @@ -376,10 +376,11 @@ impl_open_bpf(int fd, int *bpfd) > must_read(fd, &promisc, sizeof(int)); > must_read(fd, &dlt, sizeof(u_int)); > must_read(fd, &dirfilt, sizeof(u_int)); > + must_read(fd, &fildrop, sizeof(fildrop)); > iflen = read_string(fd, device, sizeof(device), __func__); > if (iflen == 0) > errx(1, "Invalid interface size specified"); > - *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt); > + *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt, fildrop); > err = errno; > if (*bpfd < 0) > logmsg(LOG_DEBUG, > Index: usr.sbin/tcpdump/privsep.h > =================================================================== > RCS file: /cvs/src/usr.sbin/tcpdump/privsep.h,v > retrieving revision 1.11 > diff -u -p -r1.11 privsep.h > --- usr.sbin/tcpdump/privsep.h 8 Nov 2018 14:06:09 -0000 1.11 > +++ usr.sbin/tcpdump/privsep.h 4 Mar 2019 22:30:32 -0000 > @@ -45,11 +45,11 @@ __dead void priv_exec(int, char **); > void priv_init_done(void); > > int setfilter(int, int, char *); > -int pcap_live(const char *, int, int, u_int, u_int); > +int pcap_live(const 
char *, int, int, u_int, u_int, u_int); > > struct bpf_program *priv_pcap_setfilter(pcap_t *, int, u_int32_t); > pcap_t *priv_pcap_live(const char *, int, int, int, char *, u_int, > - u_int); > + u_int, u_int); > pcap_t *priv_pcap_offline(const char *, char *); > > size_t priv_gethostbyaddr(char *, size_t, int, char *, size_t); > Index: usr.sbin/tcpdump/privsep_pcap.c > =================================================================== > RCS file: /cvs/src/usr.sbin/tcpdump/privsep_pcap.c,v > retrieving revision 1.23 > diff -u -p -r1.23 privsep_pcap.c > --- usr.sbin/tcpdump/privsep_pcap.c 17 Nov 2018 16:52:02 -0000 1.23 > +++ usr.sbin/tcpdump/privsep_pcap.c 4 Mar 2019 22:30:32 -0000 > @@ -173,7 +173,7 @@ priv_pcap_setfilter(pcap_t *hpcap, int o > /* privileged part of priv_pcap_live */ > int > pcap_live(const char *device, int snaplen, int promisc, u_int dlt, > - u_int dirfilt) > + u_int dirfilt, u_int fildrop) > { > int fd; > struct ifreq ifr; > @@ -201,6 +201,9 @@ pcap_live(const char *device, int snaple > if (ioctl(fd, BIOCSDIRFILT, &dirfilt) < 0) > goto error; > > + if (ioctl(fd, BIOCSFILDROP, &fildrop) < 0) > + goto error; > + > /* lock the descriptor */ > if (ioctl(fd, BIOCLOCK, NULL) < 0) > goto error; > @@ -218,7 +221,7 @@ pcap_live(const char *device, int snaple > */ > pcap_t * > priv_pcap_live(const char *dev, int slen, int prom, int to_ms, > - char *ebuf, u_int dlt, u_int dirfilt) > + char *ebuf, u_int dlt, u_int dirfilt, u_int fildrop) > { > int fd, err; > struct bpf_version bv; > @@ -247,6 +250,7 @@ priv_pcap_live(const char *dev, int slen > must_write(priv_fd, &prom, sizeof(int)); > must_write(priv_fd, &dlt, sizeof(u_int)); > must_write(priv_fd, &dirfilt, sizeof(u_int)); > + must_write(priv_fd, &fildrop, sizeof(fildrop)); > write_string(priv_fd, dev); > > fd = receive_fd(priv_fd); > Index: usr.sbin/tcpdump/tcpdump.8 > =================================================================== > RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.8,v > retrieving 
revision 1.99 > diff -u -p -r1.99 tcpdump.8 > --- usr.sbin/tcpdump/tcpdump.8 6 Jul 2018 09:59:12 -0000 1.99 > +++ usr.sbin/tcpdump/tcpdump.8 4 Mar 2019 22:30:32 -0000 > @@ -29,6 +29,7 @@ > .Nm tcpdump > .Op Fl AadefILlNnOopqStvXx > .Op Fl c Ar count > +.Op Fl B Ar fildrop > .Op Fl D Ar direction > .Op Fl E Oo Ar espalg : Oc Ns Ar espkey > .Op Fl F Ar file > @@ -58,6 +59,23 @@ The smaller of the entire packet or > bytes will be printed. > .It Fl a > Attempt to convert network and broadcast addresses to names. > +.It Fl B Ar fildrop > +Configure the drop action specified by > +.A fildrop > +to be used when the filter expression matches a packet. > +The actions are: > +.Pp > +.Bl -tag -width "capture" -offset indent -compact > +.It Cm pass > +Matching packets are accepted and captured. > +.It Cm capture > +Matching packets are dropped and captured. > +.It Cm drop > +Matching packets are dropped and not captured. > +.El > +.Pp > +The default action is > +.Cm pass . > .It Fl c Ar count > Exit after receiving > .Ar count > Index: usr.sbin/tcpdump/tcpdump.c > =================================================================== > RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.c,v > retrieving revision 1.88 > diff -u -p -r1.88 tcpdump.c > --- usr.sbin/tcpdump/tcpdump.c 8 Nov 2018 14:06:09 -0000 1.88 > +++ usr.sbin/tcpdump/tcpdump.c 4 Mar 2019 22:30:32 -0000 > @@ -61,6 +61,7 @@ > > int Aflag; /* dump ascii */ > int aflag; /* translate network and broadcast addresses */ > +int Bflag; /* BPF fildrop setting */ > int dflag; /* print filter code */ > int eflag; /* print ethernet header */ > int fflag; /* don't translate "foreign" IP address */ > @@ -231,7 +232,7 @@ main(int argc, char **argv) > > opterr = 0; > while ((op = getopt(argc, argv, > - "Aac:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) > + "AaB:c:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) > switch (op) { > > case 'A': > @@ -243,6 +244,19 @@ main(int argc, char **argv) > aflag = 1; > break; > > + case 'B': > + if 
(strcasecmp(optarg, "pass") == 0) > + Bflag = BPF_FILDROP_PASS; > + else if (strcasecmp(optarg, "capture") == 0) > + Bflag = BPF_FILDROP_CAPTURE; > + else if (strcasecmp(optarg, "drop") == 0) > + Bflag = BPF_FILDROP_DROP; > + else { > + error("invalid BPF fildrop option: %s", > + optarg); > + } > + break; > + > case 'c': > cnt = strtonum(optarg, 1, INT_MAX, &errstr); > if (errstr) > @@ -440,7 +454,7 @@ main(int argc, char **argv) > error("%s", ebuf); > } > pd = priv_pcap_live(device, snaplen, !pflag, 1000, ebuf, > - dlt, dirfilt); > + dlt, dirfilt, Bflag); > if (pd == NULL) > error("%s", ebuf); > > @@ -700,7 +714,7 @@ __dead void > usage(void) > { > (void)fprintf(stderr, > -"Usage: %s [-AadefILlNnOopqStvXx] [-c count] [-D direction]\n", > +"Usage: %s [-AadefILlNnOopqStvXx] [-B fildrop] [-c count] [-D direction]\n", > program_name); > (void)fprintf(stderr, > "\t [-E [espalg:]espkey] [-F file] [-i interface] [-r file]\n"); >