On Wed, Mar 13, 2019 at 11:32:36AM +0100, Mike Belopuhov wrote: > > David Gwynne writes: > > > On Tue, Mar 05, 2019 at 12:03:05PM +1000, David Gwynne wrote: > >> this extends the fildrop mechanism so you can drop the packets with bpf > >> using the existing fildrop method, but with an extra tweak so you can > >> avoid the cost of copying packets to userland. > >> > >> i wanted to quickly drop some packets in the rx interrupt path to try > >> and prioritise some traffic getting processed by the system. the initial > >> version was going to use weird custom DLTs and extra bpf interface > >> pointers and stuff, but most of the glue is already in place with > >> the fildrop functionality. > >> > >> this also adds a bit to tcpdump so you can set a fildrop action. it > >> means tcpdump can be used as a quick and dirty firewall. > > > > there's a bit more discussion about this that i should have included in > > my original email. > > > > firstly, the functionality it offers. this effectively offers a firewall > > with the ability to filter arbitrary packets. this has significant > > overlap with the functionality that pf offers, but there are a couple of > > important differences. pf only handles IP traffic, but we don't > > really have a good story when it comes to filtering non-ip. we could > > implement something like pf for the next protocol that people need to > > manage, but what is that next protocol? pf-like implies a highly > > optimised but constrained set of filters that deeply understands the > > protocol it is handling. is that next protocol ieee1905p? cdp? ipx? > > macsec? where should that protocol be filtered in the stack? > > > > i'm arguing that bpf with fildrop has the benefit of already existing, > > it's in place, and it already has the ability to be configured with > > arbitrary policy. considering we've got this far without handling > > non-ip, spending more time on it seems unjustified. > > > > secondly, the performance aspects of this diff. 
> > > > bpf allows for arbitrarily complicated filters, so it is entirely > > possible to slow your box down a lot by writing really complicated > > filters. this is in comparison to pf where each rule has a limit > > on how much work it will do, which is also mitigated by the ruleset > > optimiser and skip steps. i don't have a good answer to that except to > > say you can already add such filters to bpf, they just don't do anything > > except copy packets at the moment. > > > > another interesting performance consideration is that bpf runs a lot > > earlier than pf, so filtering packets with bpf can avoid a lot of work > > in the stack. if you want to pass IP statefully, pf is a much better > > hammer, but to drop packets up front bpf is interesting. > > > > for example, thanks to hrvoje popovski i now have a setup where im > > pushing ~7 million packets per second through a box to do performance > > measurements. those packets are udp from random ips to port 7 on > > another set of random ips. if i have the following rule in pf.conf: > > > > block in quick proto udp to port 7 > > > > i can rx and drop about 550kpps. if im sshed in using another > > interface, the system is super sluggish over that shell. > > > > if i use this diff and run the following; > > > > # tcpdump -B drop -i ix1 udp and port 7 > > > > i'm dropping about 1.2 million pps, and the box is responsive when sshed > > in using another interface. > > > > so, to summarise, bpf can already be used to drop packets, this is just > > a tweak to make it faster, and a tweak so tcpdump can be used to set up > > that filtering. > > > > I think this is a great development. Diff looks good as well.
I agree. OK claudio@ > >> Index: sys/net/bpf.c > >> =================================================================== > >> RCS file: /cvs/src/sys/net/bpf.c,v > >> retrieving revision 1.170 > >> diff -u -p -r1.170 bpf.c > >> --- sys/net/bpf.c 13 Jul 2018 08:51:15 -0000 1.170 > >> +++ sys/net/bpf.c 4 Mar 2019 22:30:32 -0000 > >> @@ -926,9 +926,20 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t > >> *(u_int *)addr = d->bd_fildrop; > >> break; > >> > >> - case BIOCSFILDROP: /* set "filter-drop" flag */ > >> - d->bd_fildrop = *(u_int *)addr ? 1 : 0; > >> + case BIOCSFILDROP: { /* set "filter-drop" flag */ > >> + unsigned int fildrop = *(u_int *)addr; > >> + switch (fildrop) { > >> + case BPF_FILDROP_PASS: > >> + case BPF_FILDROP_CAPTURE: > >> + case BPF_FILDROP_DROP: > >> + d->bd_fildrop = fildrop; > >> + break; > >> + default: > >> + error = EINVAL; > >> + break; > >> + } > >> break; > >> + } > >> > >> case BIOCGDIRFILT: /* get direction filter */ > >> *(u_int *)addr = d->bd_dirfilt; > >> @@ -1261,23 +1272,26 @@ _bpf_mtap(caddr_t arg, const struct mbuf > >> pktlen += m0->m_len; > >> > >> SRPL_FOREACH(d, &sr, &bp->bif_dlist, bd_next) { > >> + struct srp_ref bsr; > >> + struct bpf_program *bf; > >> + struct bpf_insn *fcode = NULL; > >> + > >> atomic_inc_long(&d->bd_rcount); > >> > >> - if ((direction & d->bd_dirfilt) != 0) > >> - slen = 0; > >> - else { > >> - struct srp_ref bsr; > >> - struct bpf_program *bf; > >> - struct bpf_insn *fcode = NULL; > >> - > >> - bf = srp_enter(&bsr, &d->bd_rfilter); > >> - if (bf != NULL) > >> - fcode = bf->bf_insns; > >> - slen = bpf_mfilter(fcode, m, pktlen); > >> - srp_leave(&bsr); > >> - } > >> + if (ISSET(d->bd_dirfilt, direction)) > >> + continue; > >> + > >> + bf = srp_enter(&bsr, &d->bd_rfilter); > >> + if (bf != NULL) > >> + fcode = bf->bf_insns; > >> + slen = bpf_mfilter(fcode, m, pktlen); > >> + srp_leave(&bsr); > >> > >> - if (slen > 0) { > >> + if (slen == 0) > >> + continue; > >> + if (d->bd_fildrop != BPF_FILDROP_PASS) > >> + 
drop = 1; > >> + if (d->bd_fildrop != BPF_FILDROP_DROP) { > >> if (!gottime++) > >> microtime(&tv); > >> > >> @@ -1285,9 +1299,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf > >> bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn, > >> &tv); > >> mtx_leave(&d->bd_mtx); > >> - > >> - if (d->bd_fildrop) > >> - drop = 1; > >> } > >> } > >> SRPL_LEAVE(&sr); > >> Index: sys/net/bpf.h > >> =================================================================== > >> RCS file: /cvs/src/sys/net/bpf.h,v > >> retrieving revision 1.65 > >> diff -u -p -r1.65 bpf.h > >> --- sys/net/bpf.h 3 Feb 2018 13:37:37 -0000 1.65 > >> +++ sys/net/bpf.h 4 Mar 2019 22:30:32 -0000 > >> @@ -126,6 +126,13 @@ struct bpf_version { > >> #define BPF_DIRECTION_IN 1 > >> #define BPF_DIRECTION_OUT (1<<1) > >> > >> +/* > >> + * Values for BIOCGFILDROP/BIOCSFILDROP > >> + */ > >> +#define BPF_FILDROP_PASS 0 /* capture, pass */ > >> +#define BPF_FILDROP_CAPTURE 1 /* capture, drop */ > >> +#define BPF_FILDROP_DROP 2 /* no capture, drop */ > >> + > >> struct bpf_timeval { > >> u_int32_t tv_sec; > >> u_int32_t tv_usec; > >> Index: share/man/man4/bpf.4 > >> =================================================================== > >> RCS file: /cvs/src/share/man/man4/bpf.4,v > >> retrieving revision 1.38 > >> diff -u -p -r1.38 bpf.4 > >> --- share/man/man4/bpf.4 28 Apr 2016 19:07:19 -0000 1.38 > >> +++ share/man/man4/bpf.4 4 Mar 2019 22:30:32 -0000 > >> @@ -391,11 +391,24 @@ This flag is initialized to zero by defa > >> .Pp > >> .It Dv BIOCSFILDROP Fa "u_int *" > >> .It Dv BIOCGFILDROP Fa "u_int *" > >> -Sets or gets the status of the > >> +Sets or gets the > >> .Dq filter drop > >> -flag. > >> -If non-zero, packets matching any filters will be reported to the > >> -associated interface so that they can be dropped. > >> +action. 
> >> +The supported actions for packets matching the filter are: > >> +.Pp > >> +.Bl -tag -width "BPF_FILDROP_CAPTURE" -compact > >> +.It Dv BPF_FILDROP_PASS > >> +Accept and capture > >> +.It Dv BPF_FILDROP_CAPTURE > >> +Drop and capture > >> +.It Dv BPF_FILDROP_DROP > >> +Drop and do not capture > >> +.El > >> +.Pp > >> +Packets matching any filter configured to drop packets will be > >> +reported to the associated interface so that they can be dropped. > >> +The default action is > >> +.Dv BPF_FILDROP_PASS . > >> .Pp > >> .It Dv BIOCSDIRFILT Fa "u_int *" > >> .It Dv BIOCGDIRFILT Fa "u_int *" > >> Index: usr.sbin/tcpdump/privsep.c > >> =================================================================== > >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.c,v > >> retrieving revision 1.52 > >> diff -u -p -r1.52 privsep.c > >> --- usr.sbin/tcpdump/privsep.c 17 Nov 2018 16:52:02 -0000 1.52 > >> +++ usr.sbin/tcpdump/privsep.c 4 Mar 2019 22:30:32 -0000 > >> @@ -224,7 +224,7 @@ priv_exec(int argc, char *argv[]) > >> /* parse the arguments for required options */ > >> opterr = 0; > >> while ((i = getopt(argc, argv, > >> - "ac:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { > >> + "aB:c:D:deE:fF:i:lLnNOopPqr:s:StT:vw:xXy:Y")) != -1) { > >> switch (i) { > >> case 'n': > >> nflag++; > >> @@ -366,7 +366,7 @@ static void > >> impl_open_bpf(int fd, int *bpfd) > >> { > >> int snaplen, promisc, err; > >> - u_int dlt, dirfilt; > >> + u_int dlt, dirfilt, fildrop; > >> char device[IFNAMSIZ]; > >> size_t iflen; > >> > >> @@ -376,10 +376,11 @@ impl_open_bpf(int fd, int *bpfd) > >> must_read(fd, &promisc, sizeof(int)); > >> must_read(fd, &dlt, sizeof(u_int)); > >> must_read(fd, &dirfilt, sizeof(u_int)); > >> + must_read(fd, &fildrop, sizeof(fildrop)); > >> iflen = read_string(fd, device, sizeof(device), __func__); > >> if (iflen == 0) > >> errx(1, "Invalid interface size specified"); > >> - *bpfd = pcap_live(device, snaplen, promisc, dlt, dirfilt); > >> + *bpfd = pcap_live(device, 
snaplen, promisc, dlt, dirfilt, fildrop); > >> err = errno; > >> if (*bpfd < 0) > >> logmsg(LOG_DEBUG, > >> Index: usr.sbin/tcpdump/privsep.h > >> =================================================================== > >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep.h,v > >> retrieving revision 1.11 > >> diff -u -p -r1.11 privsep.h > >> --- usr.sbin/tcpdump/privsep.h 8 Nov 2018 14:06:09 -0000 1.11 > >> +++ usr.sbin/tcpdump/privsep.h 4 Mar 2019 22:30:32 -0000 > >> @@ -45,11 +45,11 @@ __dead void priv_exec(int, char **); > >> void priv_init_done(void); > >> > >> int setfilter(int, int, char *); > >> -int pcap_live(const char *, int, int, u_int, u_int); > >> +int pcap_live(const char *, int, int, u_int, u_int, u_int); > >> > >> struct bpf_program *priv_pcap_setfilter(pcap_t *, int, u_int32_t); > >> pcap_t *priv_pcap_live(const char *, int, int, int, char *, u_int, > >> - u_int); > >> + u_int, u_int); > >> pcap_t *priv_pcap_offline(const char *, char *); > >> > >> size_t priv_gethostbyaddr(char *, size_t, int, char *, size_t); > >> Index: usr.sbin/tcpdump/privsep_pcap.c > >> =================================================================== > >> RCS file: /cvs/src/usr.sbin/tcpdump/privsep_pcap.c,v > >> retrieving revision 1.23 > >> diff -u -p -r1.23 privsep_pcap.c > >> --- usr.sbin/tcpdump/privsep_pcap.c 17 Nov 2018 16:52:02 -0000 > >> 1.23 > >> +++ usr.sbin/tcpdump/privsep_pcap.c 4 Mar 2019 22:30:32 -0000 > >> @@ -173,7 +173,7 @@ priv_pcap_setfilter(pcap_t *hpcap, int o > >> /* privileged part of priv_pcap_live */ > >> int > >> pcap_live(const char *device, int snaplen, int promisc, u_int dlt, > >> - u_int dirfilt) > >> + u_int dirfilt, u_int fildrop) > >> { > >> int fd; > >> struct ifreq ifr; > >> @@ -201,6 +201,9 @@ pcap_live(const char *device, int snaple > >> if (ioctl(fd, BIOCSDIRFILT, &dirfilt) < 0) > >> goto error; > >> > >> + if (ioctl(fd, BIOCSFILDROP, &fildrop) < 0) > >> + goto error; > >> + > >> /* lock the descriptor */ > >> if (ioctl(fd, BIOCLOCK, NULL) < 
0) > >> goto error; > >> @@ -218,7 +221,7 @@ pcap_live(const char *device, int snaple > >> */ > >> pcap_t * > >> priv_pcap_live(const char *dev, int slen, int prom, int to_ms, > >> - char *ebuf, u_int dlt, u_int dirfilt) > >> + char *ebuf, u_int dlt, u_int dirfilt, u_int fildrop) > >> { > >> int fd, err; > >> struct bpf_version bv; > >> @@ -247,6 +250,7 @@ priv_pcap_live(const char *dev, int slen > >> must_write(priv_fd, &prom, sizeof(int)); > >> must_write(priv_fd, &dlt, sizeof(u_int)); > >> must_write(priv_fd, &dirfilt, sizeof(u_int)); > >> + must_write(priv_fd, &fildrop, sizeof(fildrop)); > >> write_string(priv_fd, dev); > >> > >> fd = receive_fd(priv_fd); > >> Index: usr.sbin/tcpdump/tcpdump.8 > >> =================================================================== > >> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.8,v > >> retrieving revision 1.99 > >> diff -u -p -r1.99 tcpdump.8 > >> --- usr.sbin/tcpdump/tcpdump.8 6 Jul 2018 09:59:12 -0000 1.99 > >> +++ usr.sbin/tcpdump/tcpdump.8 4 Mar 2019 22:30:32 -0000 > >> @@ -29,6 +29,7 @@ > >> .Nm tcpdump > >> .Op Fl AadefILlNnOopqStvXx > >> .Op Fl c Ar count > >> +.Op Fl B Ar fildrop > >> .Op Fl D Ar direction > >> .Op Fl E Oo Ar espalg : Oc Ns Ar espkey > >> .Op Fl F Ar file > >> @@ -58,6 +59,23 @@ The smaller of the entire packet or > >> bytes will be printed. > >> .It Fl a > >> Attempt to convert network and broadcast addresses to names. > >> +.It Fl B Ar fildrop > >> +Configure the drop action specified by > >> +.Ar fildrop > >> +to be used when the filter expression matches a packet. > >> +The actions are: > >> +.Pp > >> +.Bl -tag -width "capture" -offset indent -compact > >> +.It Cm pass > >> +Matching packets are accepted and captured. > >> +.It Cm capture > >> +Matching packets are dropped and captured. > >> +.It Cm drop > >> +Matching packets are dropped and not captured. > >> +.El > >> +.Pp > >> +The default action is > >> +.Cm pass . 
> >> .It Fl c Ar count > >> Exit after receiving > >> .Ar count > >> Index: usr.sbin/tcpdump/tcpdump.c > >> =================================================================== > >> RCS file: /cvs/src/usr.sbin/tcpdump/tcpdump.c,v > >> retrieving revision 1.88 > >> diff -u -p -r1.88 tcpdump.c > >> --- usr.sbin/tcpdump/tcpdump.c 8 Nov 2018 14:06:09 -0000 1.88 > >> +++ usr.sbin/tcpdump/tcpdump.c 4 Mar 2019 22:30:32 -0000 > >> @@ -61,6 +61,7 @@ > >> > >> int Aflag; /* dump ascii */ > >> int aflag; /* translate network and broadcast > >> addresses */ > >> +int Bflag; /* BPF fildrop setting */ > >> int dflag; /* print filter code */ > >> int eflag; /* print ethernet header */ > >> int fflag; /* don't translate "foreign" IP address > >> */ > >> @@ -231,7 +232,7 @@ main(int argc, char **argv) > >> > >> opterr = 0; > >> while ((op = getopt(argc, argv, > >> - "Aac:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) > >> + "AaB:c:D:deE:fF:i:IlLnNOopqr:s:StT:vw:xXy:Y")) != -1) > >> switch (op) { > >> > >> case 'A': > >> @@ -243,6 +244,19 @@ main(int argc, char **argv) > >> aflag = 1; > >> break; > >> > >> + case 'B': > >> + if (strcasecmp(optarg, "pass") == 0) > >> + Bflag = BPF_FILDROP_PASS; > >> + else if (strcasecmp(optarg, "capture") == 0) > >> + Bflag = BPF_FILDROP_CAPTURE; > >> + else if (strcasecmp(optarg, "drop") == 0) > >> + Bflag = BPF_FILDROP_DROP; > >> + else { > >> + error("invalid BPF fildrop option: %s", > >> + optarg); > >> + } > >> + break; > >> + > >> case 'c': > >> cnt = strtonum(optarg, 1, INT_MAX, &errstr); > >> if (errstr) > >> @@ -440,7 +454,7 @@ main(int argc, char **argv) > >> error("%s", ebuf); > >> } > >> pd = priv_pcap_live(device, snaplen, !pflag, 1000, ebuf, > >> - dlt, dirfilt); > >> + dlt, dirfilt, Bflag); > >> if (pd == NULL) > >> error("%s", ebuf); > >> > >> @@ -700,7 +714,7 @@ __dead void > >> usage(void) > >> { > >> (void)fprintf(stderr, > >> -"Usage: %s [-AadefILlNnOopqStvXx] [-c count] [-D direction]\n", > >> +"Usage: %s [-AadefILlNnOopqStvXx] 
[-B fildrop] [-c count] [-D > >> direction]\n", > >> program_name); > >> (void)fprintf(stderr, > >> "\t [-E [espalg:]espkey] [-F file] [-i interface] [-r file]\n"); > >> > -- :wq Claudio