[tcpdump-workers] [PACTH] enable memory mapped access to ethernet devices (4thversion)

Paolo Abeni Wed, 02 Jan 2008 02:48:17 -0800

hi all,

In this revision I try to handle the issues pinpointed by Guy Harris in
his previous posts.


Specifically:

- the selectable_fd filed should be correctly (btw it should be
correctly set even in the previous version of the patch).

- if ring corruption is found, the read 'method' return with error and
an error message is properly written to errbuf. The function pointers
into the pcap handle are left unmodified, since all pcap functions,
except handle->read_op(), should continue to work and later calls to the
read function will return the same error.

- in the previous version of the patch, the capture snapshot was
incorrectly bound to 0xffff; this don't happen any more. 

thanks again for your reviews and happy new year ...

Paolo


--------------------------------------------------------------------

CONFIDENTIALITY NOTICE

This message and its attachments are addressed solely to the persons above and 
may contain confidential information. If you have received the message in 
error, be informed that any use of the content hereof is prohibited. Please 
return it immediately to the sender and delete the message. Should you have any 
questions, please contact us by replying to [EMAIL PROTECTED]

        Thank you

                                        www.telecomitalia.it

--------------------------------------------------------------------

Index: pcap-linux.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-linux.c,v
retrieving revision 1.131
diff -u -p -r1.131 pcap-linux.c
--- pcap-linux.c	18 Nov 2007 04:37:27 -0000	1.131
+++ pcap-linux.c	2 Jan 2008 10:35:49 -0000
@@ -23,6 +23,13 @@
  *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
  *  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
  *  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ *  Modifications:     Added PACKET_MMAP support
+ *                     Paolo Abeni <[EMAIL PROTECTED]> 
+ *                     
+ *                     based on previous works of:
+ *                     Simon Patarin <[EMAIL PROTECTED]>
+ *                     Phil Wood <[EMAIL PROTECTED]>
  */
 
 #ifndef lint
@@ -95,7 +102,7 @@ static const char rcsid[] _U_ =
 #ifdef PCAP_SUPPORT_BT
 #include "pcap-bt-linux.h"
 #endif
-	  
+
 #ifdef SITA
 #include "pcap-sita.h"
 #endif
@@ -108,10 +115,12 @@ static const char rcsid[] _U_ =
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/utsname.h>
+#include <sys/mman.h>
 #include <net/if.h>
 #include <netinet/in.h>
 #include <linux/if_ether.h>
 #include <net/if_arp.h>
+#include <poll.h>
 
 /*
  * If PF_PACKET is defined, we can use {SOCK_RAW,SOCK_DGRAM}/PF_PACKET
@@ -154,6 +163,14 @@ static const char rcsid[] _U_ =
 # ifdef PACKET_HOST
 #  define HAVE_PF_PACKET_SOCKETS
 # endif /* PACKET_HOST */
+
+
+ /* check for memory mapped access avaibility. We assume every needed 
+  * struct is defined if the macro TPACKET_HDRLEN is defined, because it
+  * uses many ring related structs and macros */
+# ifdef TPACKET_HDRLEN
+#  define HAVE_PACKET_RING
+# endif /* TPACKET_HDRLEN */
 #endif /* PF_PACKET */
 
 #ifdef SO_ATTACH_FILTER
@@ -200,8 +217,10 @@ typedef int		socklen_t;
  * Prototypes for internal functions
  */
 static void map_arphrd_to_dlt(pcap_t *, int, int);
+static short int map_packet_type_to_sll_type(short int);
 static int live_open_old(pcap_t *, const char *, int, int, char *);
 static int live_open_new(pcap_t *, const char *, int, int, char *);
+static int live_open_mmap(pcap_t *, char *);
 static int pcap_read_linux(pcap_t *, int, pcap_handler, u_char *);
 static int pcap_read_packet(pcap_t *, pcap_handler, u_char *);
 static int pcap_inject_linux(pcap_t *, const void *, size_t);
@@ -210,6 +229,18 @@ static int pcap_setfilter_linux(pcap_t *
 static int pcap_setdirection_linux(pcap_t *, pcap_direction_t);
 static void pcap_close_linux(pcap_t *);
 
+#ifdef HAVE_PACKET_RING
+#define RING_GET_FRAME(h) (((struct tpacket_hdr**)h->buffer)[handle->offset])
+
+static void destroy_ring(pcap_t *handle);
+static int create_ring(pcap_t* handle, unsigned size, char* errmsg);
+static void pcap_close_linux_mmap(pcap_t *);
+static int pcap_read_linux_mmap(pcap_t *, int, pcap_handler , u_char *);
+static int pcap_setfilter_linux_mmap(pcap_t *, struct bpf_program *);
+static int pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf);
+static int pcap_getnonblock_mmap(pcap_t *p, char *errbuf);
+#endif
+
 /*
  * Wrap some ioctl calls
  */
@@ -294,6 +325,22 @@ pcap_open_live(const char *device, int s
 	handle->snapshot	= snaplen;
 	handle->md.timeout	= to_ms;
 
+	handle->inject_op = pcap_inject_linux;
+	handle->setfilter_op = pcap_setfilter_linux;
+	handle->setdirection_op = pcap_setdirection_linux;
+	handle->set_datalink_op = NULL;	/* can't change data link type */
+	handle->getnonblock_op = pcap_getnonblock_fd;
+	handle->setnonblock_op = pcap_setnonblock_fd;
+	handle->close_op = pcap_close_linux;
+
+#ifdef SITA
+	handle->read_op = pcap_read_acn;
+	handle->stats_op = pcap_stats_acn;
+#else
+	handle->read_op = pcap_read_linux;
+	handle->stats_op = pcap_stats_linux;
+#endif
+
 	/*
 	 * NULL and "any" are special devices which give us the hint to
 	 * monitor all devices.
@@ -334,8 +381,11 @@ pcap_open_live(const char *device, int s
 	handle->fd = live_open_ok;
 	handle->bufsize = handle->snapshot;
 #else
-	if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1)
+	if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1) {
 		live_open_ok = 1;
+		if (live_open_mmap(handle, ebuf) == 1)
+			return handle;
+	}
 	else if (err == 0) {
 		/* Non-fatal error; try old way */
 		if (live_open_old(handle, device, promisc, to_ms, ebuf))
@@ -457,22 +507,6 @@ pcap_open_live(const char *device, int s
 	 */
 	handle->selectable_fd = handle->fd;
 
-	handle->inject_op = pcap_inject_linux;
-	handle->setfilter_op = pcap_setfilter_linux;
-	handle->setdirection_op = pcap_setdirection_linux;
-	handle->set_datalink_op = NULL;	/* can't change data link type */
-	handle->getnonblock_op = pcap_getnonblock_fd;
-	handle->setnonblock_op = pcap_setnonblock_fd;
-	handle->close_op = pcap_close_linux;
-
-#ifdef SITA
-	handle->read_op = pcap_read_acn;
-	handle->stats_op = pcap_stats_acn;
-#else
-	handle->read_op = pcap_read_linux;
-	handle->stats_op = pcap_stats_linux;
-#endif
-
 	return handle;
 }
 
@@ -625,42 +659,7 @@ pcap_read_packet(pcap_t *handle, pcap_ha
 		packet_len += SLL_HDR_LEN;
 
 		hdrp = (struct sll_header *)bp;
-
-		/*
-		 * Map the PACKET_ value to a LINUX_SLL_ value; we
-		 * want the same numerical value to be used in
-		 * the link-layer header even if the numerical values
-		 * for the PACKET_ #defines change, so that programs
-		 * that look at the packet type field will always be
-		 * able to handle DLT_LINUX_SLL captures.
-		 */
-		switch (from.sll_pkttype) {
-
-		case PACKET_HOST:
-			hdrp->sll_pkttype = htons(LINUX_SLL_HOST);
-			break;
-
-		case PACKET_BROADCAST:
-			hdrp->sll_pkttype = htons(LINUX_SLL_BROADCAST);
-			break;
-
-		case PACKET_MULTICAST:
-			hdrp->sll_pkttype = htons(LINUX_SLL_MULTICAST);
-			break;
-
-		case PACKET_OTHERHOST:
-			hdrp->sll_pkttype = htons(LINUX_SLL_OTHERHOST);
-			break;
-
-		case PACKET_OUTGOING:
-			hdrp->sll_pkttype = htons(LINUX_SLL_OUTGOING);
-			break;
-
-		default:
-			hdrp->sll_pkttype = -1;
-			break;
-		}
-
+		hdrp->sll_pkttype = map_packet_type_to_sll_type(from.sll_pkttype);
 		hdrp->sll_hatype = htons(from.sll_hatype);
 		hdrp->sll_halen = htons(from.sll_halen);
 		memcpy(hdrp->sll_addr, from.sll_addr,
@@ -1130,6 +1129,40 @@ pcap_setdirection_linux(pcap_t *handle, 
 	return -1;
 }
 
+
+/*
+ * Map the PACKET_ value to a LINUX_SLL_ value; we
+ * want the same numerical value to be used in
+ * the link-layer header even if the numerical values
+ * for the PACKET_ #defines change, so that programs
+ * that look at the packet type field will always be
+ * able to handle DLT_LINUX_SLL captures.
+ */
+static short int
+map_packet_type_to_sll_type(short int sll_pkttype)
+{
+	switch (sll_pkttype) {
+
+	case PACKET_HOST:
+		return htons(LINUX_SLL_HOST);
+
+	case PACKET_BROADCAST:
+		return htons(LINUX_SLL_BROADCAST);
+
+	case PACKET_MULTICAST:
+		return  htons(LINUX_SLL_MULTICAST);
+
+	case PACKET_OTHERHOST:
+		return htons(LINUX_SLL_OTHERHOST);
+
+	case PACKET_OUTGOING:
+		return htons(LINUX_SLL_OUTGOING);
+
+	default:
+		return -1;
+	}
+}
+
 /*
  *  Linux uses the ARP hardware type to identify the type of an
  *  interface. pcap uses the DLT_xxx constants for this. This
@@ -1653,6 +1686,327 @@ live_open_new(pcap_t *handle, const char
 #endif
 }
 
+static int 
+live_open_mmap(pcap_t* handle, char* errmsg)
+{
+#ifdef HAVE_PACKET_RING
+	/* by default request 4M for the ring buffer */
+	int ret = create_ring(handle, 4*1024*1024, errmsg);
+	if (ret == 0)
+		return ret;
+
+	/* override some default and inherit the others field from open_live_new
+	 * handle->offset is used to get the current position into the rx ring 
+	 * handle->cc is used to store the ring size */
+	handle->read_op = pcap_read_linux_mmap;
+	handle->close_op = pcap_close_linux_mmap;
+	handle->setfilter_op = pcap_setfilter_linux_mmap;
+	handle->setnonblock_op = pcap_setnonblock_mmap;
+	handle->getnonblock_op = pcap_getnonblock_mmap;
+	handle->selectable_fd = handle->fd;
+	return 1;
+#else /* HAVE_PACKET_RING */
+	return 0;
+#endif /* HAVE_PACKET_RING */
+}
+
+#ifdef HAVE_PACKET_RING
+
+static void
+compute_ring_block(int frame_size, unsigned *block_size, unsigned *frames_per_block)
+{
+	/* compute the minumum block size that will handle this frame. 
+	 * The block have to be page size aligned. 
+	 * The max block size allowed by the kernel is arch-dependend and 
+	 * it's not explicitly checked here. */
+	*block_size = getpagesize();
+	while (*block_size < frame_size) 
+		*block_size <<= 1;
+
+	*frames_per_block = *block_size/frame_size;
+}
+
+static int
+create_ring(pcap_t* handle, unsigned size, char* errmsg)
+{
+	unsigned i, j, ringsize, frames_per_block;
+	struct tpacket_req req;
+
+	/* Note that with large snapshot (say 64K) only a few frames 
+	 * will be available in the ring even with pretty large ring size
+	 * (and a lot of memory will be unused). 
+	 * The snap len should be carefully chosen to achive best
+	 * performance */
+	req.tp_frame_size = TPACKET_ALIGN(handle->snapshot+TPACKET_HDRLEN);
+	req.tp_frame_nr = size/req.tp_frame_size;
+	compute_ring_block(req.tp_frame_size, &req.tp_block_size, &frames_per_block);
+	req.tp_block_nr = req.tp_frame_nr / frames_per_block;
+
+	/* req.tp_frame_nr is requested to match frames_per_block*req.tp_block_nr */
+	req.tp_frame_nr = req.tp_block_nr * frames_per_block;
+
+	/* ask the kernel to create the ring */
+retry:
+	if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+					(void *) &req, sizeof(req))) {
+		/* try to reduce requested ring size to prevent memory failure */
+		if ((errno == ENOMEM) && (req.tp_block_nr > 1)) {
+			req.tp_frame_nr >>= 1;
+			req.tp_block_nr = req.tp_frame_nr/frames_per_block;
+			goto retry;
+		}
+		snprintf(errmsg, PCAP_ERRBUF_SIZE, "can't create rx ring on "
+				"packet socket %d: %d-%s", handle->fd, errno, 
+				pcap_strerror(errno));
+		return 0;
+	}
+
+	/* memory map the rx ring */
+	ringsize = req.tp_block_nr * req.tp_block_size;
+	handle->bp = mmap(0, ringsize, PROT_READ| PROT_WRITE, MAP_SHARED, 
+					handle->fd, 0);
+	if (handle->bp == MAP_FAILED) {
+		snprintf(errmsg, PCAP_ERRBUF_SIZE, "can't mmap rx ring: %d-%s",
+			errno, pcap_strerror(errno));
+
+		/* clear the allocated ring on error*/
+		destroy_ring(handle);
+		return 0;
+	}
+
+	/* allocate a ring for each frame header pointer*/
+	handle->cc = req.tp_frame_nr;
+	handle->buffer = malloc(handle->cc * sizeof(struct tpacket_hdr*));
+	if (!handle->buffer) {
+		destroy_ring(handle);
+		return 0;
+	}
+
+	/* fill the header ring with proper frame ptr*/
+	handle->offset = 0;
+	for (i=0; i<req.tp_block_nr; ++i) {
+		u_char *base = &handle->bp[i*req.tp_block_size];
+		for (j=0; j<frames_per_block; ++j, ++handle->offset) {
+			RING_GET_FRAME(handle) = (struct tpacket_hdr*) base;
+			base += req.tp_frame_size;
+		}
+	}
+
+	handle->bufsize = req.tp_frame_size;
+	handle->offset = 0;
+	return 1;
+}
+
+/* free all ring related resources*/
+static void
+destroy_ring(pcap_t *handle)
+{
+	/* tell the kernel to destroy the ring*/
+	struct tpacket_req req;
+	memset(&req, 0, sizeof(req));
+	setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+				(void *) &req, sizeof(req));
+
+	/* if ring is mapped, unmap it*/
+	if (handle->bp) {
+		/* need to re-compute the ring size */
+		unsigned frames_per_block, block_size;
+		compute_ring_block(handle->bufsize, &block_size, &frames_per_block);
+
+		/* do not perform sanity check here: we can't recover any error */
+		munmap(handle->bp, block_size * handle->cc / frames_per_block);
+		handle->bp = 0;
+	}
+
+	/* if the header ring is allocated, clear it*/
+	if (handle->buffer) {
+		free(handle->buffer);
+		handle->buffer = 0;
+	}
+}
+
+static void
+pcap_close_linux_mmap( pcap_t *handle )
+{
+	destroy_ring(handle);
+	pcap_close_linux(handle);
+}
+
+
+int
+pcap_getnonblock_mmap(pcap_t *p, char *errbuf)
+{
+	/* use negative value of timeout to indicate non blocking ops */
+	return (p->md.timeout<0);
+}
+
+int
+pcap_setnonblock_mmap(pcap_t *p, int nonblock, char *errbuf)
+{
+	/* map each value to the corrisponding 2-complement, to 
+	 * preserve the timeout value provided with pcap_open_live */
+	if (nonblock) {
+		if (p->md.timeout > 0)
+			p->md.timeout = p->md.timeout*-1 - 1;
+	} else 
+		if (p->md.timeout < 0)
+			p->md.timeout = (p->md.timeout+1)*-1;
+	return 0;
+}
+
+static int
+pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback, 
+		u_char *user)
+{
+	int pkts = 0;
+
+	/* wait for frames availability.*/
+	if ((handle->md.timeout >= 0) && !(RING_GET_FRAME(handle)->tp_status)) {
+		struct pollfd pollinfo;
+		int ret;
+
+		pollinfo.fd = handle->fd;
+		pollinfo.events = POLLIN;
+
+		do {
+			/* poll() requires a negative timeout to wait forever */
+			ret = poll(&pollinfo, 1, (handle->md.timeout > 0)?
+			 			handle->md.timeout: -1);
+			if ((ret < 0) && (errno != EINTR)) {
+				snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 
+					 "can't poll on packet socket fd %d: %d-%s",
+					handle->fd, errno, pcap_strerror(errno));
+				return -1;
+			}
+			/* check for break loop condition on interrupted syscall*/
+			if (handle->break_loop) {
+				handle->break_loop = 0;
+				return -2;
+			}
+		} while (ret < 0);
+	}
+
+	/* negative values of max_packets are used to requires all 
+	 * packets available into the ring */
+	while ((pkts < max_packets) || (max_packets <0)) {
+		int run_bpf;
+		struct sockaddr_ll *sll;
+		struct pcap_pkthdr pcaphdr;
+		unsigned char *bp;
+		struct tpacket_hdr* thdr = RING_GET_FRAME(handle);
+		if (thdr->tp_status == TP_STATUS_KERNEL)
+			break;
+
+		/* perform sanity check on internal offset. */
+		if (thdr->tp_mac+thdr->tp_snaplen > handle->bufsize) {
+			snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, 
+				"corrupted frame on kernel ring mac "
+				"offset %d + caplen %d > frame len %d\n", 
+				thdr->tp_mac, thdr->tp_snaplen, handle->bufsize);
+			return -1;
+		}
+
+		/* run filter on received packet
+		 * If the kernel filtering is enabled we need to run the
+		 * filter until all the frames present into the ring 
+		 * at filter creation time are processed. 
+		 * In such case md.use_bpf is used as a counter for the 
+		 * packet we need to filter.
+		 * Note: alternatively it could be possible to stop applying 
+		 * the filter when the ring became empty, but in can possily
+		 * happen a lot later... */
+		bp = (unsigned char*)thdr + thdr->tp_mac;
+		run_bpf = (!handle->md.use_bpf) || 
+			((handle->md.use_bpf>1) && handle->md.use_bpf--);
+		if (run_bpf && handle->fcode.bf_insns && 
+				(bpf_filter(handle->fcode.bf_insns, bp,
+					thdr->tp_len, thdr->tp_snaplen) == 0))
+			goto skip;
+
+		/* check direction and interface index */
+		sll = (void*)thdr + TPACKET_ALIGN(sizeof(*thdr));
+		if ((sll->sll_ifindex == handle->md.lo_ifindex) &&
+					(sll->sll_pkttype == PACKET_OUTGOING))
+			goto skip;
+
+		/* get required packet info from ring header */
+		pcaphdr.ts.tv_sec = thdr->tp_sec;
+		pcaphdr.ts.tv_usec = thdr->tp_usec;
+		pcaphdr.caplen = thdr->tp_snaplen;
+		pcaphdr.len = thdr->tp_len;
+
+		/* if required build in place the sll header*/
+		if (handle->md.cooked) {
+			struct sll_header *hdrp = (struct sll_header *)((char *)bp - sizeof(struct sll_header));
+
+			hdrp->sll_pkttype = map_packet_type_to_sll_type(
+							sll->sll_pkttype);
+			hdrp->sll_hatype = htons(sll->sll_hatype);
+			hdrp->sll_halen = htons(sll->sll_halen);
+			memcpy(hdrp->sll_addr, sll->sll_addr, SLL_ADDRLEN);
+			hdrp->sll_protocol = sll->sll_protocol;
+
+			/* update packet len */
+			pcaphdr.caplen += SLL_HDR_LEN;
+			pcaphdr.len += SLL_HDR_LEN;
+		}
+
+		/* pass the packet to the user */
+		pkts++;
+		callback(user, &pcaphdr, bp);
+		handle->md.packets_read++;
+
+skip:
+		/* next packet */
+		thdr->tp_status = TP_STATUS_KERNEL;
+		if (++handle->offset >= handle->cc)
+			handle->offset = 0;
+
+		/* check for break loop condition*/
+		if (handle->break_loop) {
+			handle->break_loop = 0;
+			return -2;
+		}
+	}
+	return pkts;
+}
+
+static int 
+pcap_setfilter_linux_mmap(pcap_t *handle, struct bpf_program *filter)
+{
+	int n, offset;
+	int ret = pcap_setfilter_linux(handle, filter);
+	if (ret < 0)
+		return ret;
+
+	/* if the kernel filter is enabled, we need to apply the filter on
+	 * all packets present into the ring. Get an upper bound of their number
+	 */
+	if (!handle->md.use_bpf)
+		return ret;
+
+	/* walk the ring backward and count the free slot */
+	offset = handle->offset;
+	if (--handle->offset < 0)
+		handle->offset = handle->cc - 1;
+	for (n=0; n < handle->cc; ++n) {
+		if (--handle->offset < 0)
+			handle->offset = handle->cc - 1;
+		if (RING_GET_FRAME(handle)->tp_status != TP_STATUS_KERNEL)
+			break;
+	}
+
+	/* be carefull to not change current ring position */
+	handle->offset = offset;
+
+	/* store the number of packets currently present in the ring */
+	handle->md.use_bpf = 1 + (handle->cc - n);
+	return ret;
+}
+
+#endif /* HAVE_PACKET_RING */
+
+
 #ifdef HAVE_PF_PACKET_SOCKETS
 /*
  *  Return the index of the given device name. Fill ebuf and return

-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.

[tcpdump-workers] [PACTH] enable memory mapped access to ethernet devices (4thversion)

Reply via email to