hello,
following the discussion on the implementation of the memory mapped
access for linux, I reworked my previous patch to avoid abusing the
current API. It now tries to use a default ring size of 4M, reducing its
size if the allocation of the ring fails.
I did not add any new pcap function to set the ring size, because I
realized that this feature could be useful even for platforms using bpf,
but the use of a separate function to set the ring size could raise some
portability issues (with bpf the buffer size can't be changed after the
socket is bound to an interface).
I think that a good solution could be to add a new function to open live
capture with 'extended' options like the capture ring size (as suggested
earlier by Gianluca), while the 'standard' pcap_open_live will use some
reasonable default.
Again, I appreciate any comments/suggestions on said subject.
ciao,
Paolo
--------------------------------------------------------------------
CONFIDENTIALITY NOTICE
This message and its attachments are addressed solely to the persons above and
may contain confidential information. If you have received the message in
error, be informed that any use of the content hereof is prohibited. Please
return it immediately to the sender and delete the message. Should you have any
questions, please contact us by replying to [EMAIL PROTECTED]
Thank you
www.telecomitalia.it
--------------------------------------------------------------------
Index: pcap-linux.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-linux.c,v
retrieving revision 1.131
diff -u -p -r1.131 pcap-linux.c
--- pcap-linux.c 18 Nov 2007 04:37:27 -0000 1.131
+++ pcap-linux.c 6 Dec 2007 09:15:07 -0000
@@ -23,6 +23,13 @@
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Modifications: Added PACKET_MMAP support
+ * Paolo Abeni <[EMAIL PROTECTED]>
+ *
+ * based on previous works of:
+ * Simon Patarin <[EMAIL PROTECTED]>
+ * Phil Wood <[EMAIL PROTECTED]>
*/
#ifndef lint
@@ -95,7 +102,7 @@ static const char rcsid[] _U_ =
#ifdef PCAP_SUPPORT_BT
#include "pcap-bt-linux.h"
#endif
-
+
#ifdef SITA
#include "pcap-sita.h"
#endif
@@ -108,10 +115,12 @@ static const char rcsid[] _U_ =
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
+#include <sys/mman.h>
#include <net/if.h>
#include <netinet/in.h>
#include <linux/if_ether.h>
#include <net/if_arp.h>
+#include <poll.h>
/*
* If PF_PACKET is defined, we can use {SOCK_RAW,SOCK_DGRAM}/PF_PACKET
@@ -154,6 +163,14 @@ static const char rcsid[] _U_ =
# ifdef PACKET_HOST
# define HAVE_PF_PACKET_SOCKETS
# endif /* PACKET_HOST */
+
+
+ /* check for memory mapped access availability. We assume every needed
+ * struct is defined if the macro TPACKET_HDRLEN is defined, because it
+ * uses many ring related structs and macros */
+# ifdef TPACKET_HDRLEN
+# define HAVE_PACKET_RING
+# endif /* TPACKET_HDRLEN */
#endif /* PF_PACKET */
#ifdef SO_ATTACH_FILTER
@@ -196,12 +213,18 @@ typedef int socklen_t;
*/
#define BIGGER_THAN_ALL_MTUS (64*1024)
+/* all the block pointers must be contained in a single, kmalloc allocated
+ * buffer. kmalloc is limited to 128K bytes per buffer, so the ring can hold
+ * at most 128K/sizeof(void*) blocks */
+#define MAX_BLOCK_NR (128*1024/sizeof(void*))
+
/*
* Prototypes for internal functions
*/
static void map_arphrd_to_dlt(pcap_t *, int, int);
+static short int map_packet_type_to_sll_type(short int);
static int live_open_old(pcap_t *, const char *, int, int, char *);
static int live_open_new(pcap_t *, const char *, int, int, char *);
+static int live_open_mmap(pcap_t *, char *);
static int pcap_read_linux(pcap_t *, int, pcap_handler, u_char *);
static int pcap_read_packet(pcap_t *, pcap_handler, u_char *);
static int pcap_inject_linux(pcap_t *, const void *, size_t);
@@ -209,6 +232,11 @@ static int pcap_stats_linux(pcap_t *, st
static int pcap_setfilter_linux(pcap_t *, struct bpf_program *);
static int pcap_setdirection_linux(pcap_t *, pcap_direction_t);
static void pcap_close_linux(pcap_t *);
+static void pcap_close_linux_mmap(pcap_t *);
+
+#ifdef HAVE_PACKET_RING
+static int pcap_read_linux_mmap(pcap_t *, int, pcap_handler , u_char *);
+#endif
/*
* Wrap some ioctl calls
@@ -294,6 +322,22 @@ pcap_open_live(const char *device, int s
handle->snapshot = snaplen;
handle->md.timeout = to_ms;
+ handle->inject_op = pcap_inject_linux;
+ handle->setfilter_op = pcap_setfilter_linux;
+ handle->setdirection_op = pcap_setdirection_linux;
+ handle->set_datalink_op = NULL; /* can't change data link type */
+ handle->getnonblock_op = pcap_getnonblock_fd;
+ handle->setnonblock_op = pcap_setnonblock_fd;
+ handle->close_op = pcap_close_linux;
+
+#ifdef SITA
+ handle->read_op = pcap_read_acn;
+ handle->stats_op = pcap_stats_acn;
+#else
+ handle->read_op = pcap_read_linux;
+ handle->stats_op = pcap_stats_linux;
+#endif
+
/*
* NULL and "any" are special devices which give us the hint to
* monitor all devices.
@@ -334,8 +378,11 @@ pcap_open_live(const char *device, int s
handle->fd = live_open_ok;
handle->bufsize = handle->snapshot;
#else
- if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1)
+ if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1) {
live_open_ok = 1;
+ if (live_open_mmap(handle, ebuf) == 1)
+ return handle;
+ }
else if (err == 0) {
/* Non-fatal error; try old way */
if (live_open_old(handle, device, promisc, to_ms, ebuf))
@@ -457,22 +504,6 @@ pcap_open_live(const char *device, int s
*/
handle->selectable_fd = handle->fd;
- handle->inject_op = pcap_inject_linux;
- handle->setfilter_op = pcap_setfilter_linux;
- handle->setdirection_op = pcap_setdirection_linux;
- handle->set_datalink_op = NULL; /* can't change data link type */
- handle->getnonblock_op = pcap_getnonblock_fd;
- handle->setnonblock_op = pcap_setnonblock_fd;
- handle->close_op = pcap_close_linux;
-
-#ifdef SITA
- handle->read_op = pcap_read_acn;
- handle->stats_op = pcap_stats_acn;
-#else
- handle->read_op = pcap_read_linux;
- handle->stats_op = pcap_stats_linux;
-#endif
-
return handle;
}
@@ -625,42 +656,7 @@ pcap_read_packet(pcap_t *handle, pcap_ha
packet_len += SLL_HDR_LEN;
hdrp = (struct sll_header *)bp;
-
- /*
- * Map the PACKET_ value to a LINUX_SLL_ value; we
- * want the same numerical value to be used in
- * the link-layer header even if the numerical values
- * for the PACKET_ #defines change, so that programs
- * that look at the packet type field will always be
- * able to handle DLT_LINUX_SLL captures.
- */
- switch (from.sll_pkttype) {
-
- case PACKET_HOST:
- hdrp->sll_pkttype = htons(LINUX_SLL_HOST);
- break;
-
- case PACKET_BROADCAST:
- hdrp->sll_pkttype = htons(LINUX_SLL_BROADCAST);
- break;
-
- case PACKET_MULTICAST:
- hdrp->sll_pkttype = htons(LINUX_SLL_MULTICAST);
- break;
-
- case PACKET_OTHERHOST:
- hdrp->sll_pkttype = htons(LINUX_SLL_OTHERHOST);
- break;
-
- case PACKET_OUTGOING:
- hdrp->sll_pkttype = htons(LINUX_SLL_OUTGOING);
- break;
-
- default:
- hdrp->sll_pkttype = -1;
- break;
- }
-
+ hdrp->sll_pkttype = map_packet_type_to_sll_type(from.sll_pkttype);
hdrp->sll_hatype = htons(from.sll_hatype);
hdrp->sll_halen = htons(from.sll_halen);
memcpy(hdrp->sll_addr, from.sll_addr,
@@ -1130,6 +1126,40 @@ pcap_setdirection_linux(pcap_t *handle,
return -1;
}
+
+/*
+ * Translate a kernel PACKET_ packet type into the matching LINUX_SLL_
+ * packet type, already converted to network byte order.
+ *
+ * The mapping is explicit (instead of copying the PACKET_ value) so that
+ * the same numerical value ends up in the link-layer header even if the
+ * numerical values of the PACKET_ #defines change; programs that look at
+ * the packet type field can then always handle DLT_LINUX_SLL captures.
+ *
+ * Returns -1 for a packet type without a LINUX_SLL_ counterpart.
+ */
+static short int
+map_packet_type_to_sll_type(short int sll_pkttype)
+{
+	short int sll_type;
+
+	if (sll_pkttype == PACKET_HOST)
+		sll_type = LINUX_SLL_HOST;
+	else if (sll_pkttype == PACKET_BROADCAST)
+		sll_type = LINUX_SLL_BROADCAST;
+	else if (sll_pkttype == PACKET_MULTICAST)
+		sll_type = LINUX_SLL_MULTICAST;
+	else if (sll_pkttype == PACKET_OTHERHOST)
+		sll_type = LINUX_SLL_OTHERHOST;
+	else if (sll_pkttype == PACKET_OUTGOING)
+		sll_type = LINUX_SLL_OUTGOING;
+	else
+		return -1;
+
+	return htons(sll_type);
+}
+
+
/*
* Linux uses the ARP hardware type to identify the type of an
* interface. pcap uses the DLT_xxx constants for this. This
@@ -1653,6 +1683,186 @@ live_open_new(pcap_t *handle, const char
#endif
}
+/*
+ * Try to switch an already opened packet socket to memory mapped
+ * (PACKET_RX_RING) capture.
+ *
+ * handle: capture handle; handle->fd is the packet socket and
+ *         handle->snapshot the requested snapshot length.
+ * errmsg: buffer of PCAP_ERRBUF_SIZE bytes, filled in on failure.
+ *
+ * Returns 1 when the ring has been created and mapped; in that case
+ * read_op and close_op are replaced by the mmap variants, handle->bp
+ * points to the ring, handle->cc holds its size in bytes, handle->bufsize
+ * the frame size and handle->offset the current position in the ring.
+ * Returns 0 when memory mapped capture can't be used (including when
+ * HAVE_PACKET_RING is not defined); no ring is left allocated then.
+ */
+static int
+live_open_mmap(pcap_t* handle, char* errmsg)
+{
+#ifdef HAVE_PACKET_RING
+	/* by default use 4M for the ring buffer. Note that with a large
+	 * snapshot (say 64K) this leads to few frames in the ring (and a lot
+	 * of unused memory). The snap len should be carefully chosen to
+	 * achieve best performance */
+	const unsigned int ring_size = 4*1024*1024;
+	/* room needed in every frame in front of the packet data: the ring
+	 * header plus the sockaddr_ll (TPACKET_HDRLEN) and the space where
+	 * the read routine builds the cooked (sll) header in place */
+	const unsigned int hdr_room = TPACKET_ALIGN(TPACKET_HDRLEN) +
+	    SLL_HDR_LEN;
+	unsigned int frames_per_block;
+	unsigned int min_frame_size;
+	struct tpacket_req req;
+
+	/* a frame must fit in the ring; this also rejects negative snapshot
+	 * lengths and keeps the power of 2 loops below from overflowing */
+	if ((unsigned int)handle->snapshot > ring_size - hdr_room) {
+		snprintf(errmsg, PCAP_ERRBUF_SIZE, "snapshot length %d doesn't "
+		    "fit in the rx ring", handle->snapshot);
+		return 0;
+	}
+	min_frame_size = (unsigned int)handle->snapshot + hdr_room;
+
+	/* round the required frame length to the nearest greater power of 2.
+	 * This is not imposed by the linux API, but simplifies the ring
+	 * navigation */
+	req.tp_frame_size = 32;
+	while (req.tp_frame_size < min_frame_size)
+		req.tp_frame_size <<= 1;
+
+	/* compute the minimum block size that will hold a frame.
+	 * The block has to be page size aligned.
+	 * The max block size allowed by the kernel is arch-dependent and
+	 * it's not explicitly checked here. */
+	req.tp_block_size = getpagesize();
+	while (req.tp_block_size < req.tp_frame_size)
+		req.tp_block_size <<= 1;
+
+	/* the frame count is always derived from the block count, so that
+	 * the two stay consistent after clamping and after every retry */
+	frames_per_block = req.tp_block_size/req.tp_frame_size;
+	req.tp_block_nr = ring_size/req.tp_block_size;
+	if (req.tp_block_nr == 0)
+		req.tp_block_nr = 1;
+	if (req.tp_block_nr > MAX_BLOCK_NR)
+		req.tp_block_nr = MAX_BLOCK_NR;
+	req.tp_frame_nr = req.tp_block_nr*frames_per_block;
+
+	/* ask the kernel to create the ring, halving the requested size on
+	 * memory failure until a single block is left */
+	while (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+	    (void *) &req, sizeof(req))) {
+		if ((errno != ENOMEM) || (req.tp_block_nr <= 1)) {
+			snprintf(errmsg, PCAP_ERRBUF_SIZE,
+			    "can't create rx ring on "
+			    "packet socket %d: %d-%s", handle->fd, errno,
+			    pcap_strerror(errno));
+			return 0;
+		}
+		req.tp_block_nr >>= 1;
+		req.tp_frame_nr = req.tp_block_nr*frames_per_block;
+	}
+
+	/* memory map the rx ring */
+	handle->cc = req.tp_block_nr * req.tp_block_size;
+	handle->bp = mmap(NULL, handle->cc, PROT_READ| PROT_WRITE, MAP_SHARED,
+	    handle->fd, 0);
+	if (handle->bp == MAP_FAILED) {
+		snprintf(errmsg, PCAP_ERRBUF_SIZE, "can't mmap rx ring: %d-%s",
+		    errno, pcap_strerror(errno));
+
+		/* clear the allocated ring on error*/
+		memset(&req, 0, sizeof(req));
+		setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+		    (void *) &req, sizeof(req));
+		return 0;
+	}
+
+	/* override some defaults and inherit the other fields from the
+	 * regular open path.
+	 * handle->offset is used to get the current position into the rx ring
+	 * handle->cc is used to store the ring size */
+	/* NOTE(review): this mask turns a snapshot of 65536 into 0 and
+	 * truncates larger values - confirm that this is intended */
+	handle->snapshot &= 0xffff;
+	handle->read_op = pcap_read_linux_mmap;
+	handle->close_op = pcap_close_linux_mmap;
+	handle->bufsize = req.tp_frame_size;
+	handle->selectable_fd = handle->fd;
+	handle->offset = 0;
+	handle->buffer = NULL;
+	return 1;
+#else /* HAVE_PACKET_RING */
+	return 0;
+#endif /* HAVE_PACKET_RING */
+}
+
+#ifdef HAVE_PACKET_RING
+/*
+ * read_op for memory mapped capture: deliver the frames currently
+ * available in the rx ring to the callback.
+ *
+ * handle:      capture handle set up by live_open_mmap.
+ * max_packets: maximum number of packets to deliver; a negative value
+ *              means no limit.
+ * callback:    invoked once per accepted packet.
+ * user:        opaque pointer passed through to the callback.
+ *
+ * When the ring is empty it first waits with poll(), honouring
+ * handle->md.timeout (a timeout <= 0 waits forever).
+ *
+ * Returns the number of packets passed to the callback, -1 on error
+ * (handle->errbuf is filled in) or -2 when a break loop was requested;
+ * the break_loop flag is cleared whenever -2 is returned.
+ */
+static int
+pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback,
+    u_char *user)
+{
+	int pkts = 0;
+	struct tpacket_hdr *thdr;
+
+	/* wait for frames availability. The status is read through the ring
+	 * header struct, exactly as the frame loop below does */
+	thdr = (struct tpacket_hdr *)&handle->bp[handle->offset];
+	if (thdr->tp_status == TP_STATUS_KERNEL) {
+		struct pollfd pollinfo;
+		int ret;
+
+		pollinfo.fd = handle->fd;
+		pollinfo.events = POLLIN;
+
+		do {
+			/* poll() requires a negative timeout to wait forever */
+			ret = poll(&pollinfo, 1, (handle->md.timeout > 0)?
+			    handle->md.timeout: -1);
+			if ((ret < 0) && (errno != EINTR)) {
+				snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+				    "can't poll on packet socket fd %d: %d-%s",
+				    handle->fd, errno, pcap_strerror(errno));
+				return -1;
+			}
+			/* check for break loop condition on interrupted
+			 * syscall; clear the flag as the frame loop does */
+			if (handle->break_loop) {
+				handle->break_loop = 0;
+				return -2;
+			}
+		} while (ret < 0);
+	}
+
+	while ((pkts < max_packets) || (max_packets < 0)) {
+		struct sockaddr_ll *sll;
+		struct pcap_pkthdr pcaphdr;
+		unsigned char *bp;
+
+		thdr = (struct tpacket_hdr *)&handle->bp[handle->offset];
+		if (thdr->tp_status == TP_STATUS_KERNEL)
+			break;
+
+		/* perform sanity check on internal offset: the captured data
+		 * must start and end inside the current frame */
+		if ((thdr->tp_mac >= (unsigned int)handle->bufsize) ||
+		    (thdr->tp_snaplen >
+		    (unsigned int)handle->bufsize - thdr->tp_mac)) {
+			snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+			    "corrupted frame on kernel ring: mac offset %u + "
+			    "caplen %u > frame len %d",
+			    (unsigned int)thdr->tp_mac,
+			    (unsigned int)thdr->tp_snaplen, handle->bufsize);
+			return -1;
+		}
+
+		bp = (unsigned char *)thdr + thdr->tp_mac;
+		sll = (struct sockaddr_ll *)((unsigned char *)thdr +
+		    TPACKET_ALIGN(sizeof(*thdr)));
+
+		/* check direction and interface index: skip outgoing packets
+		 * seen on the loopback interface */
+		if ((sll->sll_ifindex == handle->md.lo_ifindex) &&
+		    (sll->sll_pkttype == PACKET_OUTGOING))
+			goto skip;
+
+		/* get required packet info from ring header */
+		pcaphdr.ts.tv_sec = thdr->tp_sec;
+		pcaphdr.ts.tv_usec = thdr->tp_usec;
+		pcaphdr.caplen = thdr->tp_snaplen;
+		pcaphdr.len = thdr->tp_len;
+
+		/* if required build in place the sll header */
+		if (handle->md.cooked) {
+			struct sll_header *hdrp = (struct sll_header *)
+			    (bp - sizeof(struct sll_header));
+
+			hdrp->sll_pkttype = map_packet_type_to_sll_type(
+			    sll->sll_pkttype);
+			hdrp->sll_hatype = htons(sll->sll_hatype);
+			hdrp->sll_halen = htons(sll->sll_halen);
+			memcpy(hdrp->sll_addr, sll->sll_addr, SLL_ADDRLEN);
+			hdrp->sll_protocol = sll->sll_protocol;
+
+			/* the data handed out now starts at the sll header,
+			 * so move the pointer together with the lengths */
+			bp = (unsigned char *)hdrp;
+			pcaphdr.caplen += SLL_HDR_LEN;
+			pcaphdr.len += SLL_HDR_LEN;
+		}
+
+		/* run the userland filter on exactly the bytes the callback
+		 * would receive (sll header included in cooked mode) */
+		if (!handle->md.use_bpf && handle->fcode.bf_insns &&
+		    (bpf_filter(handle->fcode.bf_insns, bp,
+		    pcaphdr.len, pcaphdr.caplen) == 0))
+			goto skip;
+
+		/* pass the packet to the user */
+		pkts++;
+		callback(user, &pcaphdr, bp);
+		handle->md.packets_read++;
+
+skip:
+		/* give the frame back to the kernel and move to the next one,
+		 * wrapping at the end of the ring */
+		thdr->tp_status = TP_STATUS_KERNEL;
+		handle->offset += handle->bufsize;
+		if (handle->offset >= handle->cc)
+			handle->offset = 0;
+
+		/* check for break loop condition*/
+		if (handle->break_loop) {
+			handle->break_loop = 0;
+			return -2;
+		}
+	}
+	return pkts;
+}
+#endif /* HAVE_PACKET_RING */
+
+
#ifdef HAVE_PF_PACKET_SOCKETS
/*
* Return the index of the given device name. Fill ebuf and return
@@ -1826,6 +2036,11 @@ static void pcap_close_linux( pcap_t *ha
#endif /* SITA */
}
+/*
+ * close_op for handles using the memory mapped rx ring: unmap the ring
+ * (handle->bp, handle->cc bytes long), then run the regular Linux close
+ * routine.
+ */
+static void
+pcap_close_linux_mmap(pcap_t *handle)
+{
+	void *ring = handle->bp;
+	size_t ring_len = handle->cc;
+
+	munmap(ring, ring_len);
+	pcap_close_linux(handle);
+}
/*
* Try to open a packet socket using the old kernel interface.
* Returns 0 on failure.
-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.