I have attached the patch that we have been using. This is a patch for the
libpcap code in FreeBSD -CURRENT right now. It appears to work well, but
you folks might have some ideas for getting the necessary autoconf magic
in, as well make some possible architecture changes.
Let me know what you think. I have CC rwatson as well, as Robert has been
heavily involved in the development over the past year.
Thanks!
On Tue, Mar 25, 2008 at 01:11:40AM -0700, Guy Harris wrote:
> Bruce M Simpson wrote:
> >+1 here.
> >
> >Zero copy BPF has just gone into FreeBSD-CURRENT. It would be great to
> >have a snap which can do this too. Christian Peron (CC'd) has been
> >responsible for the code.
>
> That means that we *don't* want to take what we have now and release it,
> as we don't currently have any code that supports zero-copy BPF. We'd
> need to get a patch to add support for it, and add that, before
> releasing libpcap 1.0, if a goal is to get support for zero-copy BPF in
> libpcap 1.0.
Index: pcap-bpf.c
===================================================================
RCS file: /usr/ncvs/src/contrib/libpcap/pcap-bpf.c,v
retrieving revision 1.4
diff -u -r1.4 pcap-bpf.c
--- pcap-bpf.c 16 Oct 2007 02:07:55 -0000 1.4
+++ pcap-bpf.c 19 Mar 2008 13:24:17 -0000
@@ -30,6 +30,8 @@
#endif
#include <sys/param.h> /* optionally get BSD define */
+#include <sys/mman.h>
+#include <sys/poll.h>
#include <sys/time.h>
#include <sys/timeb.h>
#include <sys/socket.h>
@@ -86,6 +88,10 @@
#endif /* _AIX */
+#ifdef BIOCSETBUFMODE
+#include <machine/atomic.h>
+#endif
+
#include <ctype.h>
#include <errno.h>
#include <netdb.h>
@@ -139,6 +145,159 @@
return (0);
}
+#ifdef BIOCGETBUFMODE
+/*
+ * Zero-copy BPF buffer routines to check for and acknowledge BPF data in
+ * shared memory buffers.
+ *
+ * pcap_next_zbuf_shm(): Check for a newly available shared memory buffer,
+ * and set up p->buffer and cc to reflect one if available. Notice that if
+ * there was no prior buffer, we select zbuf1 as this will be the first
+ * buffer filled for a fresh BPF session.
+ */
+static int
+pcap_next_zbuf_shm(pcap_t *p, int *cc)
+{
+ struct bpf_zbuf_header *bzh;
+
+ if (p->zbuffer == p->zbuf2 || p->zbuffer == NULL) {
+ bzh = (struct bpf_zbuf_header *)p->zbuf1;
+ if (bzh->bzh_user_gen !=
+ atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+ p->bzh = bzh;
+ p->zbuffer = (u_char *)p->zbuf1;
+ p->buffer = p->zbuffer + sizeof(*bzh);
+ *cc = bzh->bzh_kernel_len;
+ return (1);
+ }
+ } else if (p->zbuffer == p->zbuf1) {
+ bzh = (struct bpf_zbuf_header *)p->zbuf2;
+ if (bzh->bzh_user_gen !=
+ atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+ p->bzh = bzh;
+ p->zbuffer = (u_char *)p->zbuf2;
+ p->buffer = p->zbuffer + sizeof(*bzh);
+ *cc = bzh->bzh_kernel_len;
+ return (1);
+ }
+ }
+ *cc = 0;
+ return (0);
+}
+
+/*
+ * pcap_next_zbuf() -- Similar to pcap_next_zbuf_shm(), except wait using
+ * select() for data or a timeout, and possibly force rotation of the buffer
+ * in the event we time out or are in immediate mode. Invoke the shared
+ * memory check before doing system calls in order to avoid doing avoidable
+ * work.
+ */
+static int
+pcap_next_zbuf(pcap_t *p, int *cc)
+{
+ struct bpf_zbuf bz;
+ struct timeval tv;
+ struct timespec cur;
+ fd_set r_set;
+ int data, r;
+ int tmout, expire;
+
+#define TSTOMILLI(ts) (((ts)->tv_sec * 1000) + ((ts)->tv_nsec / 1000000))
+ /*
+ * Start out by seeing whether anything is waiting by checking the
+ * next shared memory buffer for data.
+ */
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+ /*
+ * If a previous sleep was interrupted due to signal delivery, make
+ * sure that the timeout gets adjusted accordingly. This requires
+ * that we analyze when the timeout should be been expired, and
+ * subtract the current time from that. If after this operation,
+ * our timeout is less then or equal to zero, handle it like a
+ * regular timeout.
+ */
+ tmout = p->to_ms;
+ if (tmout)
+ (void) clock_gettime(CLOCK_MONOTONIC, &cur);
+ if (p->interrupted && p->to_ms) {
+ expire = TSTOMILLI(&p->firstsel) + p->to_ms;
+ tmout = expire - TSTOMILLI(&cur);
+#undef TSTOMILLI
+ if (tmout <= 0) {
+ p->interrupted = 0;
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+ if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "BIOCROTZBUF: %s", strerror(errno));
+ return (-1);
+ }
+ return (pcap_next_zbuf_shm(p, cc));
+ }
+ }
+ /*
+ * No data in the buffer, so must use select() to wait for data or
+ * the next timeout.
+ */
+ FD_ZERO(&r_set);
+ FD_SET(p->fd, &r_set);
+ if (tmout != 0) {
+ tv.tv_sec = tmout / 1000;
+ tv.tv_usec = (tmout * 1000) % 1000000;
+ }
+ r = select(p->fd + 1, &r_set, NULL, NULL, p->to_ms != 0 ? &tv :
+ NULL);
+ if (r < 0 && errno == EINTR) {
+ if (!p->interrupted && p->to_ms) {
+ p->interrupted = 1;
+ p->firstsel = cur;
+ }
+ return (0);
+ } else if (r < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "select: %s", strerror(errno));
+ return (-1);
+ }
+ p->interrupted = 0;
+ /*
+ * Check again for data, which may exist now that we've either been
+ * woken up as a result of data or timed out. Try the "there's data"
+ * case first since it doesn't require a system call.
+ */
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+
+ /*
+ * Try forcing a buffer rotation to dislodge timed out or immediate
+ * data.
+ */
+ if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "BIOCROTZBUF: %s", strerror(errno));
+ return (-1);
+ }
+ return (pcap_next_zbuf_shm(p, cc));
+}
+
+/*
+ * Notify kernel that we are done with the buffer. We don't reset zbuffer so
+ * that we know which buffer to use next time around.
+ */
+static int
+pcap_ack_zbuf(pcap_t *p)
+{
+
+ atomic_store_rel_int(&p->bzh->bzh_user_gen, p->bzh->bzh_kernel_gen);
+ p->bzh = NULL;
+ p->buffer = NULL;
+ return (0);
+}
+#endif
+
static int
pcap_read_bpf(pcap_t *p, int cnt, pcap_handler callback, u_char *user)
{
@@ -147,6 +306,9 @@
register u_char *bp, *ep;
u_char *datap;
struct bpf_insn *fcode;
+#ifdef BIOCSETBUFMODE
+ int i;
+#endif
#ifdef PCAP_FDDIPAD
register int pad;
#endif
@@ -167,7 +329,27 @@
}
cc = p->cc;
if (p->cc == 0) {
- cc = read(p->fd, (char *)p->buffer, p->bufsize);
+ /*
+ * When reading without zero-copy from a file descriptor, we
+ * use a single buffer and return a length of data in the
+ * buffer. With zero-copy, we update the p->buffer pointer
+ * to point at whatever underlying buffer contains the next
+ * data and update cc to reflect the data found in the
+ * buffer.
+ */
+#ifdef BIOCSETBUFMODE
+ if (p->zerocopy) {
+ if (p->buffer != NULL)
+ pcap_ack_zbuf(p);
+ i = pcap_next_zbuf(p, &cc);
+ if (i == 0)
+ goto again;
+ if (i < 0)
+ return (-1);
+ } else
+#endif
+ cc = read(p->fd, (char *)p->buffer, p->bufsize);
+
if (cc < 0) {
/* Don't choke when we get ptraced */
switch (errno) {
@@ -609,6 +791,10 @@
struct bpf_insn total_insn;
struct bpf_program total_prog;
struct utsname osinfo;
+#ifdef BIOCSETBUFMODE
+ struct bpf_zbuf bz;
+ u_int bufmode, zbufmax;
+#endif
#ifdef HAVE_DAG_API
if (strstr(device, "dag")) {
@@ -646,41 +832,105 @@
goto bad;
}
+#ifdef BIOCSETBUFMODE
/*
- * Try finding a good size for the buffer; 32768 may be too
- * big, so keep cutting it in half until we find a size
- * that works, or run out of sizes to try. If the default
- * is larger, don't make it smaller.
- *
- * XXX - there should be a user-accessible hook to set the
- * initial buffer size.
+ * If the BPF extension to set buffer mode is present, try setting
+ * the mode to zero-copy. If that fails, use regular buffering. If
+ * it succeeds but other setup fails, return an error to the user.
*/
- if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
- v = 32768;
- for ( ; v != 0; v >>= 1) {
- /* Ignore the return value - this is because the call fails
- * on BPF systems that don't have kernel malloc. And if
- * the call fails, it's no big deal, we just continue to
- * use the standard buffer size.
- */
- (void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
+ bufmode = BPF_BUFMODE_ZBUF;
+ if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) == 0) {
+ p->zerocopy = 1;
+ /*
+ * How to pick a buffer size: first, query the maximum buffer
+ * size supported by zero-copy. This also lets us quickly
+ * determine whether the kernel generally supports zero-copy.
+ * Then, query the default buffer size, which reflects kernel
+ * policy for a desired default. Round to the nearest page
+ * size.
+ */
+ if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
+ if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
+ v = 32768;
+#ifndef roundup
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */
+#endif
+ p->zbufsize = roundup(v, getpagesize());
+ if (p->zbufsize > zbufmax)
+ p->zbufsize = zbufmax;
+ p->zbuf1 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+ MAP_ANON, -1, 0);
+ p->zbuf2 = mmap(NULL, p->zbufsize, PROT_READ | PROT_WRITE,
+ MAP_ANON, -1, 0);
+ if (p->zbuf1 == MAP_FAILED || p->zbuf2 == MAP_FAILED) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE, "mmap: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
+ bzero(&bz, sizeof(bz));
+ bz.bz_bufa = p->zbuf1;
+ bz.bz_bufb = p->zbuf2;
+ bz.bz_buflen = p->zbufsize;
+ if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
(void)strncpy(ifr.ifr_name, device, sizeof(ifr.ifr_name));
- if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
- break; /* that size worked; we're done */
-
- if (errno != ENOBUFS) {
+ if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
snprintf(ebuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
device, pcap_strerror(errno));
goto bad;
}
- }
+ v = p->zbufsize - sizeof(struct bpf_zbuf_header);
+ } else {
+#endif
- if (v == 0) {
- snprintf(ebuf, PCAP_ERRBUF_SIZE,
- "BIOCSBLEN: %s: No buffer size worked", device);
- goto bad;
+ /*
+ * Try finding a good size for the buffer; 32768 may be too
+ * big, so keep cutting it in half until we find a size
+ * that works, or run out of sizes to try. If the default
+ * is larger, don't make it smaller.
+ *
+ * XXX - there should be a user-accessible hook to set the
+ * initial buffer size.
+ */
+ if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
+ v = 32768;
+ for ( ; v != 0; v >>= 1) {
+ /* Ignore the return value - this is because the call
+ * fails on BPF systems that don't have kernel
+ * malloc. And if the call fails, it's no big deal,
+ * we just continue to use the standard buffer size.
+ */
+ (void) ioctl(fd, BIOCSBLEN, (caddr_t)&v);
+
+ (void)strncpy(ifr.ifr_name, device,
+ sizeof(ifr.ifr_name));
+ if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) >= 0)
+ break; /* that size worked; we're done */
+
+ if (errno != ENOBUFS) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE,
+ "BIOCSETIF: %s: %s",
+ device, pcap_strerror(errno));
+ goto bad;
+ }
+ }
+
+ if (v == 0) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE,
+ "BIOCSBLEN: %s: No buffer size worked", device);
+ goto bad;
+ }
+#ifdef BIOCSETBUFMODE
}
+#endif
/* Get the data link layer type. */
if (ioctl(fd, BIOCGDLT, (caddr_t)&v) < 0) {
@@ -855,7 +1105,8 @@
}
#endif
/* set timeout */
- if (to_ms != 0) {
+ p->to_ms = to_ms;
+ if (to_ms != 0 && !p->zerocopy) {
/*
* XXX - is this seconds/nanoseconds in AIX?
* (Treating it as such doesn't fix the timeout
@@ -870,6 +1121,9 @@
goto bad;
}
}
+#ifdef BIOCSETBUFMODE
+ p->timeout = to_ms;
+#endif
#ifdef _AIX
#ifdef BIOCIMMEDIATE
@@ -942,16 +1196,22 @@
goto bad;
}
p->bufsize = v;
- p->buffer = (u_char *)malloc(p->bufsize);
- if (p->buffer == NULL) {
- snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
- pcap_strerror(errno));
- goto bad;
- }
+#ifdef BIOCSETBUFMODE
+ if (!p->zerocopy) {
+#endif
+ p->buffer = (u_char *)malloc(p->bufsize);
+ if (p->buffer == NULL) {
+ snprintf(ebuf, PCAP_ERRBUF_SIZE, "malloc: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
#ifdef _AIX
- /* For some strange reason this seems to prevent the EFAULT
- * problems we have experienced from AIX BPF. */
- memset(p->buffer, 0x0, p->bufsize);
+ /* For some strange reason this seems to prevent the EFAULT
+ * problems we have experienced from AIX BPF. */
+ memset(p->buffer, 0x0, p->bufsize);
+#endif
+#ifdef BIOCSETBUFMODE
+ }
#endif
/*
@@ -1036,7 +1296,22 @@
return (p);
bad:
+
(void)close(fd);
+#ifdef BIOCSETBUFMODE
+ /*
+ * In zero-copy mode, p->buffer is just a pointer into one of the two
+ * memory-mapped buffers, so no need to free it.
+ */
+ if (p->zerocopy) {
+ if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
+ munmap(p->zbuf1, p->zbufsize);
+ if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
+ munmap(p->zbuf2, p->zbufsize);
+ } else
+#endif
+ if (p->buffer != NULL)
+ free(p->buffer);
if (p->dlt_list != NULL)
free(p->dlt_list);
free(p);
Index: pcap-int.h
===================================================================
RCS file: /usr/ncvs/src/contrib/libpcap/pcap-int.h,v
retrieving revision 1.13
diff -u -r1.13 pcap-int.h
--- pcap-int.h 16 Oct 2007 02:07:55 -0000 1.13
+++ pcap-int.h 16 Mar 2008 17:33:32 -0000
@@ -167,12 +167,40 @@
struct pcap_md md;
/*
- * Read buffer.
+ * Read buffer -- for file descriptor read buffer model.
*/
int bufsize;
u_char *buffer;
u_char *bp;
int cc;
+ int to_ms;
+
+ /*
+ * XXXRW: Exactly how to handle ifdefs, etc, is not something I've
+ * worked out yet. Presumably we need to add a configure check for
+ * zero-copy BPF.
+ *
+ * Zero-copy read buffer -- for zero-copy BPF. 'buffer' above will
+ * alternative between these two actual mmap'd buffers as required.
+ * As there is a header on the front size of the mmap'd buffer, only
+ * some of the buffer is exposed to libpcap as a whole via bufsize;
+ * zbufsize is the true size. zbuffer tracks the current zbuf
+ * assocated with buffer so that it can be used to decide which the
+ * next buffer to read will be.
+ */
+ u_char *zbuf1, *zbuf2, *zbuffer;
+ u_int zbufsize;
+ u_int timeout;
+ u_int zerocopy;
+ u_int interrupted;
+ struct timespec firstsel;
+
+ /*
+ * If there's currently a buffer being actively processed, then it is
+ * referenced here; 'buffer' is also pointed at it, but offset by the
+ * size of the header.
+ */
+ struct bpf_zbuf_header *bzh;
/*
* Place holder for pcap_next().
Index: pcap.c
===================================================================
RCS file: /usr/ncvs/src/contrib/libpcap/pcap.c,v
retrieving revision 1.1.1.12
diff -u -r1.1.1.12 pcap.c
--- pcap.c 16 Oct 2007 02:01:45 -0000 1.1.1.12
+++ pcap.c 16 Mar 2008 17:14:49 -0000
@@ -44,6 +44,7 @@
#include <pcap-stdinc.h>
#else /* WIN32 */
#include <sys/types.h>
+#include <sys/mman.h>
#endif /* WIN32 */
#include <stdio.h>
@@ -738,6 +739,24 @@
void
pcap_close_common(pcap_t *p)
{
+#ifdef BIOCSETBUFMODE
+ /*
+ * Check to see if this pcap instance was using the zerocopy buffer
+ * mode. If it was, delete the mappings. Note that p->buffer
+ * gets initialized to one of the mmaped regions in this case, so
+ * do not try and free it directly.
+ *
+ * If the regular buffer mode was selected, then it is safe to free
+ * this memory.
+ */
+ if (p->zerocopy) {
+ if (p->zbuf1 != MAP_FAILED && p->zbuf1 != NULL)
+ munmap(p->zbuf1, p->zbufsize);
+ if (p->zbuf2 != MAP_FAILED && p->zbuf2 != NULL)
+ munmap(p->zbuf2, p->zbufsize);
+ p->buffer = NULL;
+ } else
+#endif
if (p->buffer != NULL)
free(p->buffer);
#if !defined(WIN32) && !defined(MSDOS)
-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.