David Gwynne writes:
> On Tue, Mar 05, 2019 at 12:03:05PM +1000, David Gwynne wrote: >> this extends the fildrop mechanism so you can drop the packets with bpf >> using the existing fildrop method, but with an extra tweak so you can >> avoid the cost of copying packets to userland. >> >> i wanted to quickly drop some packets in the rx interrupt path to try >> and prioritise some traffic getting processed by the system. the initial >> version was going to use weird custom DLTs and extra bpf interface >> pointers and stuff, but most of the glue is already in place with >> the fildrop functionality. >> >> this also adds a bit to tcpdump so you can set a fildrop action. it >> means tcpdump can be used as a quick and dirty firewall. > > there's a bit more discussion about this that i should have included in > my original email. > > firstly, the functionality it offers. this effectively offers a firewall > with the ability to filter arbitrary packets. this has significant > overlap with the functionality that pf offers, but there are a couple of > important differences. pf only handles IP traffic, but we don't > really have a good story when it comes to filtering non-ip. we could > implement something like pf for the next protocol that people need to > manage, but what is that next protocol? pf-like implies a highly > optimised but constrained set of filters that deeply understands the > protocol it is handling. is that next protocol ieee1905p? cdp? ipx? > macsec? where should that protocol be filtered in the stack? > > i'm arguing that bpf with fildrop has the benefit of already existing, > it's in place, and it already has the ability to be configured with > arbitrary policy. considering we've got this far without handling > non-ip, spending more time on it seems unjustified. > > secondly, the performance aspects of this diff. > > bpf allows for arbitrarily complicated filters, so it is entirely > possible to slow your box down a lot by writing really complicated > filters. 
this is in comparison to pf where each rule has a limit > on how much work it will do, which is also mitigated by the ruleset > optimiser and skip steps. i don't have a good answer to that except to > say you can already add such filters to bpf, they just don't do anything > except copy packets at the moment. > > another interesting performance consideration is that bpf runs a lot > earlier than pf, so filtering packets with bpf can avoid a lot of work > in the stack. if you want to pass IP statefully, pf is a much better > hammer, but to drop packets up front bpf is interesting. > > for example, thanks to hrvoje popovski i now have a setup where im > pushing ~7 million packets per second through a box to do performance > measurements. those packets are udp from random ips to port 7 on > another set of random ips. if i have the following rule in pf.conf: > > block in quick proto udp to port 7 > > i can rx and drop about 550kpps. if im sshed in using another > interface, the system is super sluggish over that shell. > > if i use this diff and run the following; > > # tcpdump -B drop -i ix1 udp and port 7 > > i'm dropping about 1.2 million pps, and the box is responsive when sshed > in using another interface. > > so, to summarise, bpf can already be used to drop packets, this is just > a tweak to make it faster, and a tweak so tcpdump can be used to set up > that filtering. > I think this is a great development. Diff looks good as well. >> Index: sys/net/bpf.c >> =================================================================== >> RCS file: /cvs/src/sys/net/bpf.c,v >> retrieving revision 1.170 >> diff -u -p -r1.170 bpf.c >> --- sys/net/bpf.c 13 Jul 2018 08:51:15 -0000 1.170 >> +++ sys/net/bpf.c 4 Mar 2019 22:30:32 -0000 >> @@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t >> *(u_int *)addr = d->bd_fildrop; >> break; >> >> - case BIOCSFILDROP: /* set "filter-drop" flag */ >> - d->bd_fildrop = *(u_int *)addr ? 
1 : 0; >> + case BIOCSFILDROP: { /* set "filter-drop" flag */ >> + unsigned int fildrop = *(u_int *)addr; >> + switch (fildrop) { >> + case BPF_FILDROP_PASS: >> + case BPF_FILDROP_CAPTURE: >> + case BPF_FILDROP_DROP: >> + d->bd_fildrop = fildrop; >> + break; >> + default: >> + error = EINVAL; >> + break; >> + } >> break; >> + } >> >> case BIOCGDIRFILT: /* get direction filter */ >> *(u_int *)addr = d->bd_dirfilt; >> @@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf >> pktlen += m0->m_len; >> >> SRPL_FOREACH(d, &sr, &bp->bif_dlist, bd_next) { >> + struct srp_ref bsr; >> + struct bpf_program *bf; >> + struct bpf_insn *fcode = NULL; >> + >> atomic_inc_long(&d->bd_rcount); >> >> - if ((direction & d->bd_dirfilt) != 0) >> - slen = 0; >> - else { >> - struct srp_ref bsr; >> - struct bpf_program *bf; >> - struct bpf_insn *fcode = NULL; >> - >> - bf = srp_enter(&bsr, &d->bd_rfilter); >> - if (bf != NULL) >> - fcode = bf->bf_insns; >> - slen = bpf_mfilter(fcode, m, pktlen); >> - srp_leave(&bsr); >> - } >> + if (ISSET(d->bd_dirfilt, direction)) >> + continue; >> + >> + bf = srp_enter(&bsr, &d->bd_rfilter); >> + if (bf != NULL) >> + fcode = bf->bf_insns; >> + slen = bpf_mfilter(fcode, m, pktlen); >> + srp_leave(&bsr); >> >> - if (slen > 0) { >> + if (slen == 0) >> + continue; >> + if (d->bd_fildrop != BPF_FILDROP_PASS) >> + drop = 1; >> + if (d->bd_fildrop != BPF_FILDROP_DROP) { >> if (!gottime++) >> microtime(&tv); >> >> @@ -1285,9 +1299,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf >> bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn, >> &tv); >> mtx_leave(&d->bd_mtx); >> - >> - if (d->bd_fildrop) >> - drop = 1; >> } >> } >> SRPL_LEAVE(&sr); >> Index: sys/net/bpf.h >> =================================================================== >> RCS file: /cvs/src/sys/net/bpf.h,v >> retrieving revision 1.65 >> diff -u -p -r1.65 bpf.h >> --- sys/net/bpf.h 3 Feb 2018 13:37:37 -0000 1.65 >> +++ sys/net/bpf.h 4 Mar 2019 22:30:32 -0000 >> @@ -126,6 +126,13 @@ struct 
bpf_version { >> #define BPF_DIRECTION_IN 1 >> #define BPF_DIRECTION_OUT (1<<1) >> >> +/* >> + * Values for BIOCGFILDROP/BIOCSFILDROP >> + */ >> +#define BPF_FILDROP_PASS 0 /* capture, pass */ >> +#define BPF_FILDROP_CAPTURE 1 /* capture, drop */ >> +#define BPF_FILDROP_DROP 2 /* no capture, drop */ >> + >> struct bpf_timeval { >> u_int32_t tv_sec; >> u_int32_t tv_usec; >> Index: share/man/man4/bpf.4 >> =================================================================== >> RCS file: /cvs/src/share/man/man4/bpf.4,v >> retrieving revision 1.38 >> diff -u -p -r1.38 bpf.4 >> --- share/man/man4/bpf.4 28 Apr 2016 19:07:19 -0000 1.38 >> +++ share/man/man4/bpf.4 4 Mar 2019 22:30:32 -0000 >> @@ -391,11 +391,24 @@ This flag is initialized to zero by defa >> .Pp >> .It Dv BIOCSFILDROP Fa "u_int *" >> .It Dv BIOCGFILDROP Fa "u_int *" >> -Sets or gets the status of the >> +Sets or gets the >> .Dq filter drop >> -flag. >> -If non-zero, packets matching any filters will be reported to the >> -associated interface so that they can be dropped. >> +action. >> +The supported actions for packets matching the filter are: >> +.Pp >> +.Bl -tag -width "BPF_FILDROP_CAPTURE" -compact >> +.It Dv BPF_FILDROP_PASS >> +Accept and capture >> +.It Dv BPF_FILDROP_CAPTURE >> +Drop and capture >> +.It Dv BPF_FILDROP_DROP >> +Drop and do not capture >> +.El >> +.Pp >> +Packets matching any filter configured to drop packets will be >> +reported to the associated interface so that they can be dropped. >> +The default action is >> +.Dv BPF_FILDROP_PASS . 
>> .Pp >> .It Dv BIOCSDIRFILT Fa "u_int *" >> .It Dv BIOCGDIRFILT Fa "u_int *" >> Index: usr.sbin/tcpdump/privsep.c >> =================================================================== >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.c,v >> retrieving revision 1.52 >> diff -u -p -r1.52 privsep.c >> --- usr.sbin/tcpdump/privsep.c 17 Nov 2018 16:52:02 -0000 1.52 >> +++ usr.sbin/tcpdump/privsep.c 4 Mar 2019 22:30:32 -0000 >> @@ -224,7 +224,7 @@ priv_exec(int argc, char *argv[]) >> /* parse the arguments for required options */ >> opterr = 0; >> while ((i = getopt(argc, argv, >> - "ac:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { >> + "aB:c:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { >> switch (i) { >> case 'n': >> nflag++; >> @@ -366,7 +366,7 @@ static void >> impl_open_bpf(int fd, int *bpfd) >> { >> int snaplen, promisc, err; >> - u_int dlt, dirfilt; >> + u_int dlt, dirfilt, fildrop; >> char device[IFNAMSIZ]; >> size_t iflen; >> >> @@ -376,10 +376,11 @@ impl_open_bpf(int fd, int *bpfd) >> must_read(fd, &promisc, sizeof(int)); >> must_read(fd, &dlt, sizeof(u_int)); >> must_read(fd, &dirfilt, sizeof(u_int)); >> + must_read(fd, &fildrop, sizeof(fildrop)); >> iflen = read_string(fd, device, sizeof(device), __func__); >> if (iflen == 0) >> errx(1, "Invalid interface size specified"); >> - *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt); >> + *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt, fildrop); >> err = errno; >> if (*bpfd < 0) >> logmsg(LOG_DEBUG, >> Index: usr.sbin/tcpdump/privsep.h >> =================================================================== >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.h,v >> retrieving revision 1.11 >> diff -u -p -r1.11 privsep.h >> --- usr.sbin/tcpdump/privsep.h 8 Nov 2018 14:06:09 -0000 1.11 >> +++ usr.sbin/tcpdump/privsep.h 4 Mar 2019 22:30:32 -0000 >> @@ -45,11 +45,11 @@ __dead void priv_exec(int, char **); >> void priv_init_done(void); >> >> int setfilter(int, int, char *); >> -int pcap_live(const char 
*, int, int, u_int, u_int); >> +int pcap_live(const char *, int, int, u_int, u_int, u_int); >> >> struct bpf_program *priv_pcap_setfilter(pcap_t *, int, u_int32_t); >> pcap_t *priv_pcap_live(const char *, int, int, int, char *, u_int, >> - u_int); >> + u_int, u_int); >> pcap_t *priv_pcap_offline(const char *, char *); >> >> size_t priv_gethostbyaddr(char *, size_t, int, char *, size_t); >> Index: usr.sbin/tcpdump/privsep_pcap.c >> =================================================================== >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep_pcap.c,v >> retrieving revision 1.23 >> diff -u -p -r1.23 privsep_pcap.c >> --- usr.sbin/tcpdump/privsep_pcap.c 17 Nov 2018 16:52:02 -0000 1.23 >> +++ usr.sbin/tcpdump/privsep_pcap.c 4 Mar 2019 22:30:32 -0000 >> @@ -173,7 +173,7 @@ priv_pcap_setfilter(pcap_t *hpcap, int o >> /* privileged part of priv_pcap_live */ >> int >> pcap_live(const char *device, int snaplen, int promisc, u_int dlt, >> - u_int dirfilt) >> + u_int dirfilt, u_int fildrop) >> { >> int fd; >> struct ifreq ifr; >> @@ -201,6 +201,9 @@ pcap_live(const char *device, int snaple >> if (ioctl(fd, BIOCSDIRFILT, &dirfilt) < 0) >> goto error; >> >> + if (ioctl(fd, BIOCSFILDROP, &fildrop) < 0) >> + goto error; >> + >> /* lock the descriptor */ >> if (ioctl(fd, BIOCLOCK, NULL) < 0) >> goto error; >> @@ -218,7 +221,7 @@ pcap_live(const char *device, int snaple >> */ >> pcap_t * >> priv_pcap_live(const char *dev, int slen, int prom, int to_ms, >> - char *ebuf, u_int dlt, u_int dirfilt) >> + char *ebuf, u_int dlt, u_int dirfilt, u_int fildrop) >> { >> int fd, err; >> struct bpf_version bv; >> @@ -247,6 +250,7 @@ priv_pcap_live(const char *dev, int slen >> must_write(priv_fd, &prom, sizeof(int)); >> must_write(priv_fd, &dlt, sizeof(u_int)); >> must_write(priv_fd, &dirfilt, sizeof(u_int)); >> + must_write(priv_fd, &fildrop, sizeof(fildrop)); >> write_string(priv_fd, dev); >> >> fd = receive_fd(priv_fd); >> Index: usr.sbin/tcpdump/tcpdump.8 >> 
=================================================================== >> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.8,v >> retrieving revision 1.99 >> diff -u -p -r1.99 tcpdump.8 >> --- usr.sbin/tcpdump/tcpdump.8 6 Jul 2018 09:59:12 -0000 1.99 >> +++ usr.sbin/tcpdump/tcpdump.8 4 Mar 2019 22:30:32 -0000 >> @@ -29,6 +29,7 @@ >> .Nm tcpdump >> .Op Fl AadefILlNnOopqStvXx >> .Op Fl c Ar count >> +.Op Fl B Ar fildrop >> .Op Fl D Ar direction >> .Op Fl E Oo Ar espalg : Oc Ns Ar espkey >> .Op Fl F Ar file >> @@ -58,6 +59,23 @@ The smaller of the entire packet or >> bytes will be printed. >> .It Fl a >> Attempt to convert network and broadcast addresses to names. >> +.It Fl B Ar fildrop >> +Configure the drop action specified by >> +.Ar fildrop >> +to be used when the filter expression matches a packet. >> +The actions are: >> +.Pp >> +.Bl -tag -width "capture" -offset indent -compact >> +.It Cm pass >> +Matching packets are accepted and captured. >> +.It Cm capture >> +Matching packets are dropped and captured. >> +.It Cm drop >> +Matching packets are dropped and not captured. >> +.El >> +.Pp >> +The default action is >> +.Cm pass . 
>> .It Fl c Ar count >> Exit after receiving >> .Ar count >> Index: usr.sbin/tcpdump/tcpdump.c >> =================================================================== >> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.c,v >> retrieving revision 1.88 >> diff -u -p -r1.88 tcpdump.c >> --- usr.sbin/tcpdump/tcpdump.c 8 Nov 2018 14:06:09 -0000 1.88 >> +++ usr.sbin/tcpdump/tcpdump.c 4 Mar 2019 22:30:32 -0000 >> @@ -61,6 +61,7 @@ >> >> int Aflag; /* dump ascii */ >> int aflag; /* translate network and broadcast addresses */ >> +int Bflag; /* BPF fildrop setting */ >> int dflag; /* print filter code */ >> int eflag; /* print ethernet header */ >> int fflag; /* don't translate "foreign" IP address */ >> @@ -231,7 +232,7 @@ main(int argc, char **argv) >> >> opterr = 0; >> while ((op = getopt(argc, argv, >> - "Aac:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) >> + "AaB:c:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) >> switch (op) { >> >> case 'A': >> @@ -243,6 +244,19 @@ main(int argc, char **argv) >> aflag = 1; >> break; >> >> + case 'B': >> + if (strcasecmp(optarg, "pass") == 0) >> + Bflag = BPF_FILDROP_PASS; >> + else if (strcasecmp(optarg, "capture") == 0) >> + Bflag = BPF_FILDROP_CAPTURE; >> + else if (strcasecmp(optarg, "drop") == 0) >> + Bflag = BPF_FILDROP_DROP; >> + else { >> + error("invalid BPF fildrop option: %s", >> + optarg); >> + } >> + break; >> + >> case 'c': >> cnt = strtonum(optarg, 1, INT_MAX, &errstr); >> if (errstr) >> @@ -440,7 +454,7 @@ main(int argc, char **argv) >> error("%s", ebuf); >> } >> pd = priv_pcap_live(device, snaplen, !pflag, 1000, ebuf, >> - dlt, dirfilt); >> + dlt, dirfilt, Bflag); >> if (pd == NULL) >> error("%s", ebuf); >> >> @@ -700,7 +714,7 @@ __dead void >> usage(void) >> { >> (void)fprintf(stderr, >> -"Usage: %s [-AadefILlNnOopqStvXx] [-c count] [-D direction]\n", >> +"Usage: %s [-AadefILlNnOopqStvXx] [-B fildrop] [-c count] [-D direction]\n", >> program_name); >> (void)fprintf(stderr, >> "\t [-E [espalg:]espkey] [-F file] [-i 
interface] [-r file]\n"); >>