Hi list,
Following the discussion about the memory mapped access for Linux, I
tried to rework some existing code (originally independently created by
Simon Patarin and Phil Wood) to enable this functionality with the
minimum impact on the current pcap code.
It does not use environment variables to control the memory-mapped ring
parameters; instead, the requested snap len is used: the low-order 16 bits
select the ring frame size and the high-order 16 bits select the number of
ring frames. If the high-order bits are 0, as in every current libpcap
usage, a reasonable default is used.
The poll() function is used to implement the timeout in the read method:
should I add an autoconf check for poll() availability, or use select()
instead?
I would really appreciate any comments and/or review on this matter (even a
'please don't try to push this thing ...' :-)
Thanks,
Paolo
--------------------------------------------------------------------
CONFIDENTIALITY NOTICE
This message and its attachments are addressed solely to the persons above and
may contain confidential information. If you have received the message in
error, be informed that any use of the content hereof is prohibited. Please
return it immediately to the sender and delete the message. Should you have any
questions, please contact us by replying to [EMAIL PROTECTED]
Thank you
www.telecomitalia.it
--------------------------------------------------------------------
Index: pcap-linux.c
===================================================================
RCS file: /tcpdump/master/libpcap/pcap-linux.c,v
retrieving revision 1.131
diff -u -p -r1.131 pcap-linux.c
--- pcap-linux.c 18 Nov 2007 04:37:27 -0000 1.131
+++ pcap-linux.c 5 Dec 2007 10:49:23 -0000
@@ -23,6 +23,13 @@
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Modifications: Added PACKET_MMAP support
+ * Paolo Abeni <[EMAIL PROTECTED]>
+ *
+ * based on previous works of:
+ * Simon Patarin <[EMAIL PROTECTED]>
+ * Phil Wood <[EMAIL PROTECTED]>
*/
#ifndef lint
@@ -95,7 +102,7 @@ static const char rcsid[] _U_ =
#ifdef PCAP_SUPPORT_BT
#include "pcap-bt-linux.h"
#endif
-
+
#ifdef SITA
#include "pcap-sita.h"
#endif
@@ -108,10 +115,12 @@ static const char rcsid[] _U_ =
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
+#include <sys/mman.h>
#include <net/if.h>
#include <netinet/in.h>
#include <linux/if_ether.h>
#include <net/if_arp.h>
+#include <poll.h>
/*
* If PF_PACKET is defined, we can use {SOCK_RAW,SOCK_DGRAM}/PF_PACKET
@@ -154,6 +163,14 @@ static const char rcsid[] _U_ =
# ifdef PACKET_HOST
# define HAVE_PF_PACKET_SOCKETS
# endif /* PACKET_HOST */
+
+
+ /* check for memory mapped access availability. We assume every needed
+ * struct is defined if the macro TPACKET_HDRLEN is defined, because it
+ * uses many ring related structs and macros */
+# ifdef TPACKET_HDRLEN
+# define HAVE_PACKET_RING
+# endif /* TPACKET_HDRLEN */
#endif /* PF_PACKET */
#ifdef SO_ATTACH_FILTER
@@ -196,12 +213,18 @@ typedef int socklen_t;
*/
#define BIGGER_THAN_ALL_MTUS (64*1024)
+/* all the block pointers must be contained in a single, kmalloc-allocated
+ * buffer. kmalloc is limited to 128K-byte buffers */
+#define MAX_BLOCK_NR (128*1024*1024/sizeof(void*))
+
/*
* Prototypes for internal functions
*/
static void map_arphrd_to_dlt(pcap_t *, int, int);
+static short int map_packet_type_to_sll_type(short int);
static int live_open_old(pcap_t *, const char *, int, int, char *);
static int live_open_new(pcap_t *, const char *, int, int, char *);
+static int live_open_mmap(pcap_t *, char *);
static int pcap_read_linux(pcap_t *, int, pcap_handler, u_char *);
static int pcap_read_packet(pcap_t *, pcap_handler, u_char *);
static int pcap_inject_linux(pcap_t *, const void *, size_t);
@@ -209,6 +232,11 @@ static int pcap_stats_linux(pcap_t *, st
static int pcap_setfilter_linux(pcap_t *, struct bpf_program *);
static int pcap_setdirection_linux(pcap_t *, pcap_direction_t);
static void pcap_close_linux(pcap_t *);
+static void pcap_close_linux_mmap(pcap_t *);
+
+#ifdef HAVE_PACKET_RING
+static int pcap_read_linux_mmap(pcap_t *, int, pcap_handler , u_char *);
+#endif
/*
* Wrap some ioctl calls
@@ -294,6 +322,22 @@ pcap_open_live(const char *device, int s
handle->snapshot = snaplen;
handle->md.timeout = to_ms;
+ handle->inject_op = pcap_inject_linux;
+ handle->setfilter_op = pcap_setfilter_linux;
+ handle->setdirection_op = pcap_setdirection_linux;
+ handle->set_datalink_op = NULL; /* can't change data link type */
+ handle->getnonblock_op = pcap_getnonblock_fd;
+ handle->setnonblock_op = pcap_setnonblock_fd;
+ handle->close_op = pcap_close_linux;
+
+#ifdef SITA
+ handle->read_op = pcap_read_acn;
+ handle->stats_op = pcap_stats_acn;
+#else
+ handle->read_op = pcap_read_linux;
+ handle->stats_op = pcap_stats_linux;
+#endif
+
/*
* NULL and "any" are special devices which give us the hint to
* monitor all devices.
@@ -334,8 +378,11 @@ pcap_open_live(const char *device, int s
handle->fd = live_open_ok;
handle->bufsize = handle->snapshot;
#else
- if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1)
+ if ((err = live_open_new(handle, device, promisc, to_ms, ebuf)) == 1) {
live_open_ok = 1;
+ if (live_open_mmap(handle, ebuf) == 1)
+ return handle;
+ }
else if (err == 0) {
/* Non-fatal error; try old way */
if (live_open_old(handle, device, promisc, to_ms, ebuf))
@@ -457,22 +504,6 @@ pcap_open_live(const char *device, int s
*/
handle->selectable_fd = handle->fd;
- handle->inject_op = pcap_inject_linux;
- handle->setfilter_op = pcap_setfilter_linux;
- handle->setdirection_op = pcap_setdirection_linux;
- handle->set_datalink_op = NULL; /* can't change data link type */
- handle->getnonblock_op = pcap_getnonblock_fd;
- handle->setnonblock_op = pcap_setnonblock_fd;
- handle->close_op = pcap_close_linux;
-
-#ifdef SITA
- handle->read_op = pcap_read_acn;
- handle->stats_op = pcap_stats_acn;
-#else
- handle->read_op = pcap_read_linux;
- handle->stats_op = pcap_stats_linux;
-#endif
-
return handle;
}
@@ -625,42 +656,7 @@ pcap_read_packet(pcap_t *handle, pcap_ha
packet_len += SLL_HDR_LEN;
hdrp = (struct sll_header *)bp;
-
- /*
- * Map the PACKET_ value to a LINUX_SLL_ value; we
- * want the same numerical value to be used in
- * the link-layer header even if the numerical values
- * for the PACKET_ #defines change, so that programs
- * that look at the packet type field will always be
- * able to handle DLT_LINUX_SLL captures.
- */
- switch (from.sll_pkttype) {
-
- case PACKET_HOST:
- hdrp->sll_pkttype = htons(LINUX_SLL_HOST);
- break;
-
- case PACKET_BROADCAST:
- hdrp->sll_pkttype = htons(LINUX_SLL_BROADCAST);
- break;
-
- case PACKET_MULTICAST:
- hdrp->sll_pkttype = htons(LINUX_SLL_MULTICAST);
- break;
-
- case PACKET_OTHERHOST:
- hdrp->sll_pkttype = htons(LINUX_SLL_OTHERHOST);
- break;
-
- case PACKET_OUTGOING:
- hdrp->sll_pkttype = htons(LINUX_SLL_OUTGOING);
- break;
-
- default:
- hdrp->sll_pkttype = -1;
- break;
- }
-
+ hdrp->sll_pkttype = map_packet_type_to_sll_type(from.sll_pkttype);
hdrp->sll_hatype = htons(from.sll_hatype);
hdrp->sll_halen = htons(from.sll_halen);
memcpy(hdrp->sll_addr, from.sll_addr,
@@ -1130,6 +1126,40 @@ pcap_setdirection_linux(pcap_t *handle,
return -1;
}
+
+/*
+ * Map the PACKET_ value to a LINUX_SLL_ value; we
+ * want the same numerical value to be used in
+ * the link-layer header even if the numerical values
+ * for the PACKET_ #defines change, so that programs
+ * that look at the packet type field will always be
+ * able to handle DLT_LINUX_SLL captures.
+ */
+static short int
+map_packet_type_to_sll_type(short int sll_pkttype)
+{
+ switch (sll_pkttype) {
+
+ case PACKET_HOST:
+ return htons(LINUX_SLL_HOST);
+
+ case PACKET_BROADCAST:
+ return htons(LINUX_SLL_BROADCAST);
+
+ case PACKET_MULTICAST:
+ return htons(LINUX_SLL_MULTICAST);
+
+ case PACKET_OTHERHOST:
+ return htons(LINUX_SLL_OTHERHOST);
+
+ case PACKET_OUTGOING:
+ return htons(LINUX_SLL_OUTGOING);
+
+ default:
+ return -1;
+ }
+}
+
/*
* Linux uses the ARP hardware type to identify the type of an
* interface. pcap uses the DLT_xxx constants for this. This
@@ -1653,6 +1683,192 @@ live_open_new(pcap_t *handle, const char
#endif
}
+static int
+live_open_mmap(pcap_t* handle, char* errmsg)
+{
+#ifdef HAVE_PACKET_RING
+ int allow_retry = 0;
+ int frames_per_block, frame_req_size;
+ struct tpacket_req req;
+
+ /* use higher 16 bits to specify frame number and low 16 to specify
+ * frame size. Assume sane defaults if these values are not good */
+ frame_req_size = (handle->snapshot & 0xffff) + TPACKET_HDRLEN;
+ req.tp_frame_nr = handle->snapshot >> 16;
+ if (req.tp_frame_nr < 1) {
+ req.tp_frame_nr = 2048;
+ allow_retry = 1;
+ }
+
+ /* round the requested frame length up to the next power of 2.
+ * This is not imposed by the Linux API, but simplifies the ring navigation */
+ req.tp_frame_size = 32;
+ while (req.tp_frame_size < frame_req_size)
+ req.tp_frame_size <<= 1;
+
+ /* compute the minimum block size that will handle this frame.
+ * The block has to be page-size aligned.
+ * The max block size allowed by the kernel is arch-dependent, but is
+ * greater than 128K on all archs and we can't obtain a frame size
+ * greater than 128K */
+ req.tp_block_size = getpagesize();
+ while (req.tp_block_size < frame_req_size)
+ req.tp_block_size <<= 1;
+
+ frames_per_block = req.tp_block_size/req.tp_frame_size;
+ req.tp_block_nr = req.tp_frame_nr/frames_per_block;
+ if (req.tp_block_nr > MAX_BLOCK_NR)
+ req.tp_block_nr = MAX_BLOCK_NR;
+
+ /* ask the kernel to create the ring */
+retry:
+ if (setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+ (void *) &req, sizeof(req))) {
+ /* if the ring size has not been chosen by the user, try
+ * to shrink it to prevent memory failure */
+ if ((errno == ENOMEM) && allow_retry && (req.tp_block_nr > 1)) {
+ req.tp_frame_nr>>=1;
+ req.tp_block_nr = req.tp_frame_nr/frames_per_block;
+ goto retry;
+ }
+ snprintf(errmsg, PCAP_ERRBUF_SIZE, "can't create rx ring on "
+ "packet socket %d: %d-%s", handle->fd, errno,
+ pcap_strerror(errno));
+ return 0;
+ }
+
+ /* memory map the rx ring */
+ handle->cc = req.tp_block_nr * req.tp_block_size;
+ handle->bp = mmap(0, handle->cc, PROT_READ| PROT_WRITE, MAP_SHARED,
+ handle->fd, 0);
+ if (handle->bp == MAP_FAILED) {
+ snprintf(errmsg, PCAP_ERRBUF_SIZE, "can't mmap rx ring: %d-%s",
+ errno, pcap_strerror(errno));
+
+ /* clear the allocated ring on error*/
+ memset(&req, 0, sizeof(req));
+ setsockopt(handle->fd, SOL_PACKET, PACKET_RX_RING,
+ (void *) &req, sizeof(req));
+ return 0;
+ }
+
+ /* override some defaults and inherit the other fields from live_open_new.
+ * handle->offset is used to get the current position into the rx ring
+ * handle->cc is used to store the ring size */
+ handle->snapshot &= 0xffff;
+ handle->read_op = pcap_read_linux_mmap;
+ handle->close_op = pcap_close_linux_mmap;
+ handle->bufsize = req.tp_frame_size;
+ handle->selectable_fd = handle->fd;
+ handle->offset = 0;
+ handle->buffer = 0;
+ return 1;
+#else /* HAVE_PACKET_RING */
+ return 0;
+#endif /* HAVE_PACKET_RING */
+}
+
+#ifdef HAVE_PACKET_RING
+static int
+pcap_read_linux_mmap(pcap_t *handle, int max_packets, pcap_handler callback,
+ u_char *user)
+{
+ int pkts = 0;
+
+ /* wait for frames availability.*/
+ if (!(*((unsigned*)&handle->bp[handle->offset]))) {
+ struct pollfd pollinfo;
+ int ret;
+
+ pollinfo.fd = handle->fd;
+ pollinfo.events = POLLIN;
+
+ do {
+ /* poll() requires a negative timeout to wait forever */
+ ret = poll(&pollinfo, 1, (handle->md.timeout > 0)?
+ handle->md.timeout: -1);
+ if ((ret < 0) && (errno != EINTR)) {
+ snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
+ "can't poll on packet socket fd %d: %d-%s",
+ handle->fd, errno, pcap_strerror(errno));
+ return -1;
+ }
+ /* check for break loop condition on interrupted syscall*/
+ if (handle->break_loop)
+ return -2;
+ } while (ret < 0);
+ }
+
+ while ((pkts < max_packets) || (max_packets <0)) {
+ struct sockaddr_ll *sll;
+ struct pcap_pkthdr pcaphdr;
+ unsigned char *bp;
+ struct tpacket_hdr* thdr = (struct tpacket_hdr*)&handle->bp[handle->offset];
+ if (thdr->tp_status == TP_STATUS_KERNEL)
+ break;
+
+ /* perform sanity check on internal offset. */
+ if (thdr->tp_mac >= handle->bufsize)
+ return -1;
+
+ /* run filter on received packet */
+ bp = (unsigned char*)thdr + thdr->tp_mac;
+ if (!handle->md.use_bpf && handle->fcode.bf_insns &&
+ (bpf_filter(handle->fcode.bf_insns, bp,
+ thdr->tp_len, thdr->tp_snaplen) == 0))
+ goto skip;
+
+ /* check direction and interface index */
+ sll = (void*)thdr + TPACKET_ALIGN(sizeof(*thdr));
+ if ((sll->sll_ifindex == handle->md.lo_ifindex) &&
+ (sll->sll_pkttype == PACKET_OUTGOING))
+ goto skip;
+
+ /* get required packet info from ring header */
+ pcaphdr.ts.tv_sec = thdr->tp_sec;
+ pcaphdr.ts.tv_usec = thdr->tp_usec;
+ pcaphdr.caplen = thdr->tp_snaplen;
+ pcaphdr.len = thdr->tp_len;
+
+ /* if required build in place the sll header*/
+ if (handle->md.cooked) {
+ struct sll_header *hdrp = (struct sll_header *)((char *)bp - sizeof(struct sll_header));
+
+ hdrp->sll_pkttype = map_packet_type_to_sll_type(
+ sll->sll_pkttype);
+ hdrp->sll_hatype = htons(sll->sll_hatype);
+ hdrp->sll_halen = htons(sll->sll_halen);
+ memcpy(hdrp->sll_addr, sll->sll_addr, SLL_ADDRLEN);
+ hdrp->sll_protocol = sll->sll_protocol;
+
+ /* update packet len */
+ pcaphdr.caplen += SLL_HDR_LEN;
+ pcaphdr.len += SLL_HDR_LEN;
+ }
+
+ /* pass the packet to the user */
+ pkts++;
+ callback(user, &pcaphdr, bp);
+ handle->md.packets_read++;
+
+skip:
+ /* next packet*/
+ thdr->tp_status = TP_STATUS_KERNEL;
+ handle->offset += handle->bufsize;
+ if (handle->offset >= handle->cc)
+ handle->offset = 0;
+
+ /* check for break loop condition*/
+ if (handle->break_loop) {
+ handle->break_loop = 0;
+ return -2;
+ }
+ }
+ return pkts;
+}
+#endif /* HAVE_PACKET_RING */
+
+
#ifdef HAVE_PF_PACKET_SOCKETS
/*
* Return the index of the given device name. Fill ebuf and return
@@ -1826,6 +2042,11 @@ static void pcap_close_linux( pcap_t *ha
#endif /* SITA */
}
+static void pcap_close_linux_mmap( pcap_t *handle )
+{
+ munmap(handle->bp, handle->cc);
+ pcap_close_linux(handle);
+}
/*
* Try to open a packet socket using the old kernel interface.
* Returns 0 on failure.
-
This is the tcpdump-workers list.
Visit https://cod.sandelman.ca/ to unsubscribe.