All,
I have included a patch for review. This adds zero-copy bpf support
to libpcap. It should be noted that I've tried to incorporate all
the feedback that I recieved after the previous submission.
There was one un-related fix in this patch:
Index: configure.in
===================================================================
RCS file: /tcpdump/master/libpcap/configure.in,v
retrieving revision 1.154
diff -u -r1.154 configure.in
@@ -37,7 +62,7 @@
AC_CHECK_HEADERS(net/pfvar.h, , , [#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>])
-if test "$ac_cv_header_net_pfvar_h" == yes; then
+if test "$ac_cv_header_net_pfvar_h" = yes; then
#
# Check for various PF actions.
#
This should fix the check for pf.
Index: configure.in
===================================================================
RCS file: /tcpdump/master/libpcap/configure.in,v
retrieving revision 1.154
diff -u -r1.154 configure.in
--- configure.in 6 Aug 2008 08:29:07 -0000 1.154
+++ configure.in 12 Sep 2008 12:40:36 -0000
@@ -21,6 +21,31 @@
AC_LBL_CHECK_TYPE(u_int32_t, u_int)
AC_LBL_CHECK_TYPE(u_int64_t, unsigned long long)
+dnl
+dnl Check for zero-copy bpf support.
+dnl
+AC_CHECK_HEADERS(net/bpf.h, , , [#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/if.h>])
+if test "$ac_cv_header_net_bpf_h" = "yes"; then
+ #
+ # Check for various PF actions.
+ #
+ AC_MSG_CHECKING(whether net/bpf.h supports zerocopy bpf)
+ AC_TRY_COMPILE(
+ [#include <sys/socket.h>
+ #include <sys/ioctl.h>
+ #include <net/if.h>
+ #include <net/bpf.h>],
+ [return (BIOCROTZBUF + BPF_BUFMODE_ZBUF);],
+ [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_ZEROCOPY_BPF, 1,
+ [define if net/bpf.h supports zerocopy bpf])
+ ],
+ AC_MSG_RESULT(no))
+fi
+
#
# Try to arrange for large file support.
#
@@ -37,7 +62,7 @@
AC_CHECK_HEADERS(net/pfvar.h, , , [#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>])
-if test "$ac_cv_header_net_pfvar_h" == yes; then
+if test "$ac_cv_header_net_pfvar_h" = yes; then
#
# Check for various PF actions.
#
Index: pcap-bpf.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-bpf.c,v
retrieving revision 1.111
diff -u -r1.111 pcap-bpf.c
--- pcap-bpf.c 1 Jul 2008 08:02:33 -0000 1.111
+++ pcap-bpf.c 12 Sep 2008 12:40:36 -0000
@@ -28,6 +28,9 @@
#endif
#include <sys/param.h> /* optionally get BSD define */
+#ifdef HAVE_ZEROCOPY_BPF
+#include <sys/mman.h>
+#endif
#include <sys/time.h>
#include <sys/timeb.h>
#include <sys/socket.h>
@@ -35,6 +38,10 @@
#include <sys/ioctl.h>
#include <sys/utsname.h>
+#ifdef HAVE_ZEROCOPY_BPF
+#include <machine/atomic.h>
+#endif
+
#include <net/if.h>
#ifdef _AIX
@@ -158,6 +165,227 @@
static int pcap_setdirection_bpf(pcap_t *, pcap_direction_t);
static int pcap_set_datalink_bpf(pcap_t *p, int dlt);
+#ifdef HAVE_ZEROCOPY_BPF
+/*
+ * For zerocopy bpf, we need to override the setnonblock/getnonblock routines
+ * so we don't call select(2) if the pcap handle is in non-blocking mode. We
+ * preserve the timeout supplied by pcap_open functions to make sure it
+ * does not get clobbered if the pcap handle moves between blocking and non-
+ * blocking mode.
+ */
+static int
+pcap_getnonblock_zbuf(pcap_t *p, char *errbuf)
+{
+
+ /*
+ * Use a negative value for the timeout to represent that the
+ * pcap handle is in non-blocking mode.
+ */
+ return (p->md.timeout < 0);
+}
+
+static int
+pcap_setnonblock_zbuf(pcap_t *p, int nonblock, char *errbuf)
+{
+
+ /*
+ * Map each value to the corresponding 2's complement, to
+ * preserve the timeout value provided with pcap_set_timeout.
+ * (from pcap-linux.c).
+ */
+ if (nonblock) {
+ if (p->md.timeout > 0)
+ p->md.timeout = p->md.timeout * -1 - 1;
+ } else
+ if (p->md.timeout < 0)
+ p->md.timeout = (p->md.timeout + 1) * -1;
+ return (0);
+}
+
+/*
+ * Zero-copy specific close method. Un-map the shred buffers then call
+ * pcap_close_common.
+ */
+static void
+pcap_close_zbuf(pcap_t *p)
+{
+
+ /*
+ * Check to see if this pcap instance was using the zerocopy buffer
+ * mode. If it was, delete the mappings. Note that p->buffer
+ * gets initialized to one of the mmaped regions in this case, so
+ * do not try and free it directly.
+ *
+ * If the regular buffer mode was selected, then it is safe to free
+ * this memory.
+ */
+ if (p->md.zerocopy == 0) {
+ pcap_cleanup_live_common(p);
+ return;
+ }
+ if (p->md.zbuf1 != MAP_FAILED && p->md.zbuf1 != NULL)
+ (void) munmap(p->md.zbuf1, p->md.zbufsize);
+ if (p->md.zbuf2 != MAP_FAILED && p->md.zbuf2 != NULL)
+ (void) munmap(p->md.zbuf2, p->md.zbufsize);
+ p->buffer = NULL;
+ pcap_cleanup_live_common(p);
+}
+
+/*
+ * Zero-copy BPF buffer routines to check for and acknowledge BPF data in
+ * shared memory buffers.
+ *
+ * pcap_next_zbuf_shm(): Check for a newly available shared memory buffer,
+ * and set up p->buffer and cc to reflect one if available. Notice that if
+ * there was no prior buffer, we select zbuf1 as this will be the first
+ * buffer filled for a fresh BPF session.
+ */
+static int
+pcap_next_zbuf_shm(pcap_t *p, int *cc)
+{
+ struct bpf_zbuf_header *bzh;
+
+ if (p->md.zbuffer == p->md.zbuf2 || p->md.zbuffer == NULL) {
+ bzh = (struct bpf_zbuf_header *)p->md.zbuf1;
+ if (bzh->bzh_user_gen !=
+ atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+ p->md.bzh = bzh;
+ p->md.zbuffer = (u_char *)p->md.zbuf1;
+ p->buffer = p->md.zbuffer + sizeof(*bzh);
+ *cc = bzh->bzh_kernel_len;
+ return (1);
+ }
+ } else if (p->md.zbuffer == p->md.zbuf1) {
+ bzh = (struct bpf_zbuf_header *)p->md.zbuf2;
+ if (bzh->bzh_user_gen !=
+ atomic_load_acq_int(&bzh->bzh_kernel_gen)) {
+ p->md.bzh = bzh;
+ p->md.zbuffer = (u_char *)p->md.zbuf2;
+ p->buffer = p->md.zbuffer + sizeof(*bzh);
+ *cc = bzh->bzh_kernel_len;
+ return (1);
+ }
+ }
+ *cc = 0;
+ return (0);
+}
+
+/*
+ * pcap_next_zbuf() -- Similar to pcap_next_zbuf_shm(), except wait using
+ * select() for data or a timeout, and possibly force rotation of the buffer
+ * in the event we time out or are in immediate mode. Invoke the shared
+ * memory check before doing system calls in order to avoid doing avoidable
+ * work.
+ */
+static int
+pcap_next_zbuf(pcap_t *p, int *cc)
+{
+ struct bpf_zbuf bz;
+ struct timeval tv;
+ struct timespec cur;
+ fd_set r_set;
+ int data, r;
+ int expire, tmout;
+
+#define TSTOMILLI(ts) (((ts)->tv_sec * 1000) + ((ts)->tv_nsec / 1000000))
+ /*
+ * Start out by seeing whether anything is waiting by checking the
+ * next shared memory buffer for data.
+ */
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+ /*
+ * If a previous sleep was interrupted due to signal delivery, make
+ * sure that the timeout gets adjusted accordingly. This requires
+ * that we analyze when the timeout should be been expired, and
+ * subtract the current time from that. If after this operation,
+ * our timeout is less then or equal to zero, handle it like a
+ * regular timeout.
+ */
+ tmout = p->md.timeout;
+ if (tmout)
+ (void) clock_gettime(CLOCK_MONOTONIC, &cur);
+ if (p->md.interrupted && p->md.timeout) {
+ expire = TSTOMILLI(&p->md.firstsel) + p->md.timeout;
+ tmout = expire - TSTOMILLI(&cur);
+#undef TSTOMILLI
+ if (tmout <= 0) {
+ p->md.interrupted = 0;
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+ if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "BIOCROTZBUF: %s", strerror(errno));
+ return (-1);
+ }
+ return (pcap_next_zbuf_shm(p, cc));
+ }
+ }
+ /*
+ * No data in the buffer, so must use select() to wait for data or
+ * the next timeout. Note that we only call select if the handle
+ * is in blocking mode.
+ */
+ if (p->md.timeout >= 0) {
+ FD_ZERO(&r_set);
+ FD_SET(p->fd, &r_set);
+ if (tmout != 0) {
+ tv.tv_sec = tmout / 1000;
+ tv.tv_usec = (tmout * 1000) % 1000000;
+ }
+ r = select(p->fd + 1, &r_set, NULL, NULL,
+ p->md.timeout != 0 ? &tv : NULL);
+ if (r < 0 && errno == EINTR) {
+ if (!p->md.interrupted && p->md.timeout) {
+ p->md.interrupted = 1;
+ p->md.firstsel = cur;
+ }
+ return (0);
+ } else if (r < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "select: %s", strerror(errno));
+ return (-1);
+ }
+ }
+ p->md.interrupted = 0;
+ /*
+ * Check again for data, which may exist now that we've either been
+ * woken up as a result of data or timed out. Try the "there's data"
+ * case first since it doesn't require a system call.
+ */
+ data = pcap_next_zbuf_shm(p, cc);
+ if (data)
+ return (data);
+ /*
+ * Try forcing a buffer rotation to dislodge timed out or immediate
+ * data.
+ */
+ if (ioctl(p->fd, BIOCROTZBUF, &bz) < 0) {
+ (void) snprintf(p->errbuf, PCAP_ERRBUF_SIZE,
+ "BIOCROTZBUF: %s", strerror(errno));
+ return (-1);
+ }
+ return (pcap_next_zbuf_shm(p, cc));
+}
+
+/*
+ * Notify kernel that we are done with the buffer. We don't reset zbuffer so
+ * that we know which buffer to use next time around.
+ */
+static int
+pcap_ack_zbuf(pcap_t *p)
+{
+
+ atomic_store_rel_int(&p->md.bzh->bzh_user_gen,
+ p->md.bzh->bzh_kernel_gen);
+ p->md.bzh = NULL;
+ p->buffer = NULL;
+ return (0);
+}
+#endif
+
pcap_t *
pcap_create(const char *device, char *ebuf)
{
@@ -173,6 +401,11 @@
return (NULL);
p->activate_op = pcap_activate_bpf;
+#ifdef HAVE_ZEROCOPY_BPF
+ p->cleanup_op = pcap_close_zbuf;
+ p->setnonblock_op = pcap_setnonblock_zbuf;
+ p->getnonblock_op = pcap_getnonblock_zbuf;
+#endif
p->can_set_rfmon_op = pcap_can_set_rfmon_bpf;
return (p);
}
@@ -507,6 +740,9 @@
#ifdef PCAP_FDDIPAD
register int pad;
#endif
+#ifdef HAVE_ZEROCOPY_BPF
+ int i;
+#endif
again:
/*
@@ -523,6 +759,25 @@
}
cc = p->cc;
if (p->cc == 0) {
+ /*
+ * When reading without zero-copy from a file descriptor, we
+ * use a single buffer and return a length of data in the
+ * buffer. With zero-copy, we update the p->buffer pointer
+ * to point at whatever underlying buffer contains the next
+ * data and update cc to reflect the data found in the
+ * buffer.
+ */
+#ifdef HAVE_ZEROCOPY_BPF
+ if (p->md.zerocopy) {
+ if (p->buffer != NULL)
+ pcap_ack_zbuf(p);
+ i = pcap_next_zbuf(p, &cc);
+ if (i == 0)
+ goto again;
+ if (i < 0)
+ return (-1);
+ } else
+#endif
cc = read(p->fd, (char *)p->buffer, p->bufsize);
if (cc < 0) {
/* Don't choke when we get ptraced */
@@ -954,6 +1209,18 @@
p->md.must_clear = 0;
}
+#ifdef HAVE_ZEROCOPY_BPF
+ /*
+ * In zero-copy mode, p->buffer is just a pointer into one of the two
+ * memory-mapped buffers, so no need to free it.
+ */
+ if (p->md.zerocopy) {
+ if (p->md.zbuf1 != MAP_FAILED && p->md.zbuf1 != NULL)
+ munmap(p->md.zbuf1, p->md.zbufsize);
+ if (p->md.zbuf2 != MAP_FAILED && p->md.zbuf2 != NULL)
+ munmap(p->md.zbuf2, p->md.zbufsize);
+ }
+#endif
if (p->md.device != NULL) {
free(p->md.device);
p->md.device = NULL;
@@ -1073,6 +1340,10 @@
struct bpf_program total_prog;
struct utsname osinfo;
int have_osinfo = 0;
+#ifdef HAVE_ZEROCOPY_BPF
+ struct bpf_zbuf bz;
+ u_int bufmode, zbufmax;
+#endif
fd = bpf_open(p);
if (fd < 0) {
@@ -1189,7 +1460,63 @@
}
}
#endif /* __APPLE__ */
-
+#ifdef HAVE_ZEROCOPY_BPF
+ /*
+ * If the BPF extension to set buffer mode is present, try setting
+ * the mode to zero-copy. If that fails, use regular buffering. If
+ * it succeeds but other setup fails, return an error to the user.
+ */
+ bufmode = BPF_BUFMODE_ZBUF;
+ if (ioctl(fd, BIOCSETBUFMODE, (caddr_t)&bufmode) == 0) {
+ p->md.zerocopy = 1;
+ /*
+ * How to pick a buffer size: first, query the maximum buffer
+ * size supported by zero-copy. This also lets us quickly
+ * determine whether the kernel generally supports zero-copy.
+ * Then, query the default buffer size, which reflects kernel
+ * policy for a desired default. Round to the nearest page
+ * size.
+ */
+ if (ioctl(fd, BIOCGETZMAX, (caddr_t)&zbufmax) < 0) {
+ snprintf(p->errbuf, PCAP_ERRBUF_SIZE, "BIOCGETZMAX: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
+ if ((ioctl(fd, BIOCGBLEN, (caddr_t)&v) < 0) || v < 32768)
+ v = 32768;
+#ifndef roundup
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */
+#endif
+ p->md.zbufsize = roundup(v, getpagesize());
+ if (p->md.zbufsize > zbufmax)
+ p->md.zbufsize = zbufmax;
+ p->md.zbuf1 = mmap(NULL, p->md.zbufsize, PROT_READ | PROT_WRITE,
+ MAP_ANON, -1, 0);
+ p->md.zbuf2 = mmap(NULL, p->md.zbufsize, PROT_READ | PROT_WRITE,
+ MAP_ANON, -1, 0);
+ if (p->md.zbuf1 == MAP_FAILED || p->md.zbuf2 == MAP_FAILED) {
+ snprintf(p->errbuf, PCAP_ERRBUF_SIZE, "mmap: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
+ bzero(&bz, sizeof(bz));
+ bz.bz_bufa = p->md.zbuf1;
+ bz.bz_bufb = p->md.zbuf2;
+ bz.bz_buflen = p->md.zbufsize;
+ if (ioctl(fd, BIOCSETZBUF, (caddr_t)&bz) < 0) {
+ snprintf(p->errbuf, PCAP_ERRBUF_SIZE, "BIOCSETZBUF: %s",
+ pcap_strerror(errno));
+ goto bad;
+ }
+ (void)strncpy(ifr.ifr_name, p->opt.source, sizeof(ifr.ifr_name));
+ if (ioctl(fd, BIOCSETIF, (caddr_t)&ifr) < 0) {
+ snprintf(p->errbuf, PCAP_ERRBUF_SIZE, "BIOCSETIF: %s: %s",
+ p->opt.source, pcap_strerror(errno));
+ goto bad;
+ }
+ v = p->md.zbufsize - sizeof(struct bpf_zbuf_header);
+ } else
+#endif
/*
* Set the buffer size.
*/
@@ -1508,7 +1835,11 @@
}
#endif
/* set timeout */
- if (p->md.timeout != 0) {
+#ifdef HAVE_ZEROCOPY_BPF
+ if (p->md.timeout != 0 && !p->md.zerocopy) {
+#else
+ if (p->md.timeout) {
+#endif /* HAVE_ZEROCOPY_BPF
/*
* XXX - is this seconds/nanoseconds in AIX?
* (Treating it as such doesn't fix the timeout
@@ -1599,6 +1930,9 @@
goto bad;
}
p->bufsize = v;
+#ifdef HAVE_ZEROCOPY_BPF
+ if (!p->md.zerocopy) {
+#endif
p->buffer = (u_char *)malloc(p->bufsize);
if (p->buffer == NULL) {
snprintf(p->errbuf, PCAP_ERRBUF_SIZE, "malloc: %s",
@@ -1611,6 +1945,9 @@
* problems we have experienced from AIX BPF. */
memset(p->buffer, 0x0, p->bufsize);
#endif
+#ifdef HAVE_ZEROCOPY_BPF
+ }
+#endif
/*
* If there's no filter program installed, there's
Index: pcap-int.h
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-int.h,v
retrieving revision 1.93
diff -u -r1.93 pcap-int.h
--- pcap-int.h 6 Aug 2008 07:49:19 -0000 1.93
+++ pcap-int.h 12 Sep 2008 12:40:36 -0000
@@ -152,6 +152,28 @@
* Same as in linux above, introduce
* generally? */
#endif /* HAVE_DAG_API */
+#ifdef HAVE_ZEROCOPY_BPF
+ /*
+ * Zero-copy read buffer -- for zero-copy BPF. 'buffer' above will
+ * alternative between these two actual mmap'd buffers as required.
+ * As there is a header on the front size of the mmap'd buffer, only
+ * some of the buffer is exposed to libpcap as a whole via bufsize;
+ * zbufsize is the true size. zbuffer tracks the current zbuf
+ * assocated with buffer so that it can be used to decide which the
+ * next buffer to read will be.
+ */
+ u_char *zbuf1, *zbuf2, *zbuffer;
+ u_int zbufsize;
+ u_int zerocopy;
+ u_int interrupted;
+ struct timespec firstsel;
+ /*
+ * If there's currently a buffer being actively processed, then it is
+ * referenced here; 'buffer' is also pointed at it, but offset by the
+ * size of the header.
+ */
+ struct bpf_zbuf_header *bzh;
+#endif /* HAVE_ZEROCOPY_BPF */
};
/*
-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.