Please test the attached diff with tcpdump and pflogd filter expressions you commonly use to make sure it does not break anything.
This is needed for Henning's work on pflog. Instructions: 1. Apply the patch under /usr/src/lib/libpcap, 2. re-build libpcap and install it. 3. restart pflogd The patch changes the way BPF code is generated from filter expressions for the pflog interface, to allow for variable-sized pflog headers. Things to test for: 1. Run your favorite filter expressions on both pflog and non-pflog interfaces, and make sure the packets are filtered correctly. 2. Make sure pflogd still generates correct logs, especially if you are using a filter expression. 3. Try complex filter expressions on the pflog interface and see if they still fit the memory (there is an upper bound to the number of instructions, and this diff increases the size of the generated code). 4. dump the result of some tcpdump filter code before and after applying this patch for *non-pflog* interfaces. There should be *no change*. You can use '-d' switch to tcpdump to dump the filter code. You can use '-O' to disable optimization. Example: tcpdump -i em0 -d host 1.2.3.4 and port 80 Please let me know about your test results, what you tested, and the architecture you tested on. Background: Years ago, when designing the new pflog header, we assumed that the header would have to be extended in the future. We did not want to switch DLT numbers again, so we added a 'size' field which also doubled as a 'version' field. This would allow us to add fields to the pflog header in the future and still be backwards compatible. Unfortunately, our plans were foiled by the mess that was libpcap. The bpf filtering language and filter code generator in libpcap was designed with the assumption that the link layer has a fixed size (this is also the reason why there are two 802_11 header types for WiFi). Now that Henning has some real need to add to the pflog header, he asked me to revisit that code and try to get bpf filtering work on variable sized pflog headers. Technical details: BPF uses a very simple language for testing packets against a filter language. The filter expression you specify in tcpdump is parsed and assembled into a bytecode that the kernel can use to match packets. Each interface has a specific link-type (DLT_EN10MB for ethernet, DLT_PFLOG for pflog etc.) and each link type has a header infront of the packet itself. The code generation itself is done in libpcap(4) for simplicity, the size of the link-header for each link-type is hardcoded into the code generator. The diff modifies the code generator to handle variable-sized link-headers. For pflog, which has the header length in the header itself, the generator generates additional BPF code to parse the pflog link-header to determine the size of the link header instead of assuming a fixed size. The two important offsets based on the link header are 1) the start of the link payload (eg. IP header) and 2) the start of the protocol header (eg. TCP header) these values require a number of BPF instructions to compute, and used extensively by most filter constructs. To prevent the code size from increasing too much, the offsets are computed once, at the start, and the results are placed in 'registers'. Let me know if you have any questions. Thanks, Can Index: gencode.c =================================================================== RCS file: /cvs/src/lib/libpcap/gencode.c,v retrieving revision 1.33 diff -u -p -r1.33 gencode.c --- gencode.c 26 Jun 2010 16:47:07 -0000 1.33 +++ gencode.c 1 Jul 2010 04:51:46 -0000 @@ -99,6 +99,13 @@ static void free_reg(int); static struct block *root; +/* initialization code used for variable link header */ +static struct slist *init_code = NULL; + +/* Flags and registers for variable link type handling */ +static int variable_nl; +static int nl_reg, iphl_reg; + /* * We divy out chunks of memory rather than call malloc each time so * we don't have to worry about leaking memory. It's probably @@ -126,7 +133,9 @@ static void backpatch(struct block *, st static void merge(struct block *, struct block *); static struct block *gen_cmp(u_int, u_int, bpf_int32); static struct block *gen_cmp_gt(u_int, u_int, bpf_int32); +static struct block *gen_cmp_nl(u_int, u_int, bpf_int32); static struct block *gen_mcmp(u_int, u_int, bpf_int32, bpf_u_int32); +static struct block *gen_mcmp_nl(u_int, u_int, bpf_int32, bpf_u_int32); static struct block *gen_bcmp(u_int, u_int, const u_char *); static struct block *gen_uncond(int); static __inline struct block *gen_true(void); @@ -296,6 +305,12 @@ pcap_compile(pcap_t *p, struct bpf_progr if (root == NULL) root = gen_retblk(snaplen); + /* prepend initialization code to root */ + if (init_code != NULL && root != NULL) { + sappend(init_code, root->stmts); + root->stmts = init_code; + init_code = NULL; + } if (optimize && !no_optimize) { bpf_optimize(&root); if (root == NULL || @@ -344,6 +359,12 @@ pcap_compile_nopcap(int snaplen_arg, int if (root == NULL) root = gen_retblk(snaplen_arg); + /* prepend initialization code to root */ + if (init_code != NULL && root != NULL) { + sappend(init_code, root->stmts); + root->stmts = init_code; + init_code = NULL; + } if (optimize) { bpf_optimize(&root); if (root == NULL || @@ -421,6 +442,15 @@ finish_parse(p) p->sense = !p->sense; backpatch(p, gen_retblk(0)); root = p->head; + + if (iphl_reg != -1) { + free_reg(iphl_reg); + iphl_reg = -1; + } + if (nl_reg != -1) { + free_reg(nl_reg); + nl_reg = -1; + } } void @@ -507,6 +537,23 @@ gen_mcmp(offset, size, v, mask) } static struct block * +gen_mcmp_nl(offset, size, v, mask) + u_int offset, size; + bpf_int32 v; + bpf_u_int32 mask; +{ + struct block *b = gen_cmp_nl(offset, size, v); + struct slist *s; + + if (mask != 0xffffffff) { + s = new_stmt(BPF_ALU|BPF_AND|BPF_K); + s->s.k = mask; + b->stmts->next = s; + } + return b; +} + +static struct block * gen_bcmp(offset, size, v) register u_int offset, size; register const u_char *v; @@ -555,11 +602,83 @@ static u_int off_nl_nosnap; static int linktype; +static struct slist * +nl2X_stmt(void) +{ + struct slist *s, *tmp; + + if (nl_reg == -1) { + switch (linktype) { + case DLT_PFLOG: + /* The pflog header contains PFLOG_REAL_HDRLEN + which does NOT include the padding. Round + up to the nearest dword boundary */ + s = new_stmt(BPF_LD|BPF_B|BPF_ABS); + s->s.k = 0; + + tmp = new_stmt(BPF_ALU|BPF_ADD|BPF_K); + tmp->s.k = 3; + sappend(s, tmp); + + tmp = new_stmt(BPF_ALU|BPF_AND|BPF_K); + tmp->s.k = 0xf6; + sappend(s, tmp); + + nl_reg = alloc_reg(); + tmp = new_stmt(BPF_ST); + tmp->s.k = nl_reg; + sappend(s, tmp); + + break; + default: + bpf_error("Unknown header size for link type 0x%x", + linktype); + } + + if (init_code == NULL) + init_code = s; + else + sappend(init_code, s); + } + + s = new_stmt(BPF_LDX|BPF_MEM); + s->s.k = nl_reg; + + return s; +} + + +static struct block * +gen_cmp_nl(offset, size, v) + u_int offset, size; + bpf_int32 v; +{ + struct slist *s, *tmp; + struct block *b; + + if (variable_nl) { + s = nl2X_stmt(); + tmp = new_stmt(BPF_LD|BPF_IND|size); + tmp->s.k = offset; + sappend(s, tmp); + } else { + s = new_stmt(BPF_LD|BPF_ABS|size); + s->s.k = offset + off_nl; + } + b = new_block(JMP(BPF_JEQ)); + b->stmts = s; + b->s.k = v; + + return b; +} + static void init_linktype(type) int type; { linktype = type; + init_code = NULL; + nl_reg = iphl_reg = -1; switch (type) { @@ -660,8 +779,8 @@ init_linktype(type) case DLT_PFLOG: off_linktype = 0; - /* XXX read from header? */ - off_nl = PFLOG_HDRLEN; + variable_nl = 1; + off_nl = 0; return; case DLT_PFSYNC: @@ -849,7 +968,7 @@ gen_hostop(addr, mask, dir, proto, src_o linktype); } b0 = gen_linktype(proto); - b1 = gen_mcmp(offset, BPF_W, (bpf_int32)addr, mask); + b1 = gen_mcmp_nl(offset, BPF_W, (bpf_int32)addr, mask); gen_and(b0, b1); return b1; } @@ -896,12 +1015,12 @@ gen_hostop6(addr, mask, dir, proto, src_ /* this order is important */ a = (u_int32_t *)addr; m = (u_int32_t *)mask; - b1 = gen_mcmp(offset + 12, BPF_W, ntohl(a[3]), ntohl(m[3])); - b0 = gen_mcmp(offset + 8, BPF_W, ntohl(a[2]), ntohl(m[2])); + b1 = gen_mcmp_nl(offset + 12, BPF_W, ntohl(a[3]), ntohl(m[3])); + b0 = gen_mcmp_nl(offset + 8, BPF_W, ntohl(a[2]), ntohl(m[2])); gen_and(b0, b1); - b0 = gen_mcmp(offset + 4, BPF_W, ntohl(a[1]), ntohl(m[1])); + b0 = gen_mcmp_nl(offset + 4, BPF_W, ntohl(a[1]), ntohl(m[1])); gen_and(b0, b1); - b0 = gen_mcmp(offset + 0, BPF_W, ntohl(a[0]), ntohl(m[0])); + b0 = gen_mcmp_nl(offset + 0, BPF_W, ntohl(a[0]), ntohl(m[0])); gen_and(b0, b1); b0 = gen_linktype(proto); gen_and(b0, b1); @@ -1047,26 +1166,26 @@ gen_dnhostop(addr, dir, base_off) } b0 = gen_linktype(ETHERTYPE_DN); /* Check for pad = 1, long header case */ - tmp = gen_mcmp(base_off + 2, BPF_H, - (bpf_int32)ntohs(0x0681), (bpf_int32)ntohs(0x07FF)); - b1 = gen_cmp(base_off + 2 + 1 + offset_lh, + tmp = gen_mcmp_nl(base_off + 2, BPF_H, + (bpf_int32)ntohs(0x0681), (bpf_int32)ntohs(0x07FF)); + b1 = gen_cmp_nl(base_off + 2 + 1 + offset_lh, BPF_H, (bpf_int32)ntohs(addr)); gen_and(tmp, b1); /* Check for pad = 0, long header case */ - tmp = gen_mcmp(base_off + 2, BPF_B, (bpf_int32)0x06, (bpf_int32)0x7); - b2 = gen_cmp(base_off + 2 + offset_lh, BPF_H, (bpf_int32)ntohs(addr)); + tmp = gen_mcmp_nl(base_off + 2, BPF_B, (bpf_int32)0x06, (bpf_int32)0x7); + b2 = gen_cmp_nl(base_off + 2 + offset_lh, BPF_H, (bpf_int32)ntohs(addr)); gen_and(tmp, b2); gen_or(b2, b1); /* Check for pad = 1, short header case */ - tmp = gen_mcmp(base_off + 2, BPF_H, - (bpf_int32)ntohs(0x0281), (bpf_int32)ntohs(0x07FF)); - b2 = gen_cmp(base_off + 2 + 1 + offset_sh, + tmp = gen_mcmp_nl(base_off + 2, BPF_H, + (bpf_int32)ntohs(0x0281), (bpf_int32)ntohs(0x07FF)); + b2 = gen_cmp_nl(base_off + 2 + 1 + offset_sh, BPF_H, (bpf_int32)ntohs(addr)); gen_and(tmp, b2); gen_or(b2, b1); /* Check for pad = 0, short header case */ - tmp = gen_mcmp(base_off + 2, BPF_B, (bpf_int32)0x02, (bpf_int32)0x7); - b2 = gen_cmp(base_off + 2 + offset_sh, BPF_H, (bpf_int32)ntohs(addr)); + tmp = gen_mcmp_nl(base_off + 2, BPF_B, (bpf_int32)0x02, (bpf_int32)0x7); + b2 = gen_cmp_nl(base_off + 2 + offset_sh, BPF_H, (bpf_int32)ntohs(addr)); gen_and(tmp, b2); gen_or(b2, b1); @@ -1096,15 +1215,15 @@ gen_host(addr, mask, proto, dir) case Q_IP: return gen_hostop(addr, mask, dir, ETHERTYPE_IP, - off_nl + 12, off_nl + 16); + 12, 16); case Q_RARP: return gen_hostop(addr, mask, dir, ETHERTYPE_REVARP, - off_nl + 14, off_nl + 24); + 14, 24); case Q_ARP: return gen_hostop(addr, mask, dir, ETHERTYPE_ARP, - off_nl + 14, off_nl + 24); + 14, 24); case Q_TCP: bpf_error("'tcp' modifier applied to host"); @@ -1131,7 +1250,7 @@ gen_host(addr, mask, proto, dir) bpf_error("ATALK host filtering not implemented"); case Q_DECNET: - return gen_dnhostop(addr, dir, off_nl); + return gen_dnhostop(addr, dir, 0); case Q_SCA: bpf_error("SCA host filtering not implemented"); @@ -1229,7 +1348,7 @@ gen_host6(addr, mask, proto, dir) case Q_IPV6: return gen_hostop6(addr, mask, dir, ETHERTYPE_IPV6, - off_nl + 8, off_nl + 24); + 8, 24); case Q_ICMPV6: bpf_error("'icmp6' modifier applied to host"); @@ -1430,12 +1549,19 @@ gen_proto_abbrev(proto) static struct block * gen_ipfrag() { - struct slist *s; + struct slist *s, *tmp; struct block *b; /* not ip frag */ - s = new_stmt(BPF_LD|BPF_H|BPF_ABS); - s->s.k = off_nl + 6; + if (variable_nl) { + s = nl2X_stmt(); + tmp = new_stmt(BPF_LD|BPF_H|BPF_IND); + tmp->s.k = 6; + sappend(s, tmp); + } else { + s = new_stmt(BPF_LD|BPF_H|BPF_ABS); + s->s.k = off_nl + 6; + } b = new_block(JMP(BPF_JSET)); b->s.k = 0x1fff; b->stmts = s; @@ -1444,19 +1570,67 @@ gen_ipfrag() return b; } +static struct slist * +iphl_to_x(void) +{ + struct slist *s, *tmp; + + /* XXX clobbers A if variable_nl*/ + if (variable_nl) { + if (iphl_reg == -1) { + /* X <- nl_off */ + s = nl2X_stmt(); + + /* A = p[X+0] */ + tmp = new_stmt(BPF_LD|BPF_B|BPF_IND); + tmp->s.k = 0; + sappend(s, tmp); + + /* A = A & 0x0f */ + tmp = new_stmt(BPF_ALU|BPF_AND|BPF_K); + tmp->s.k = 0x0f; + sappend(s, tmp); + + /* A = A << 2 */ + tmp = new_stmt(BPF_ALU|BPF_LSH|BPF_K); + tmp->s.k = 2; + sappend(s, tmp); + + /* A = A + X (add off_nl again to compansate) */ + sappend(s, new_stmt(BPF_ALU|BPF_ADD|BPF_X)); + + /* MEM[iphl_reg] = A */ + iphl_reg = alloc_reg(); + tmp = new_stmt(BPF_ST); + tmp->s.k = iphl_reg; + sappend(s, tmp); + + sappend(init_code, s); + } + s = new_stmt(BPF_LDX|BPF_MEM); + s->s.k = iphl_reg; + + } else { + s = new_stmt(BPF_LDX|BPF_MSH|BPF_B); + s->s.k = off_nl; + } + + return s; +} + static struct block * gen_portatom(off, v) int off; bpf_int32 v; { - struct slist *s; + struct slist *s, *tmp; struct block *b; - s = new_stmt(BPF_LDX|BPF_MSH|BPF_B); - s->s.k = off_nl; + s = iphl_to_x(); - s->next = new_stmt(BPF_LD|BPF_IND|BPF_H); - s->next->s.k = off_nl + off; + tmp = new_stmt(BPF_LD|BPF_IND|BPF_H); + tmp->s.k = off_nl + off; /* off_nl == 0 if variable_nl */ + sappend(s, tmp); b = new_block(JMP(BPF_JEQ)); b->stmts = s; @@ -1471,7 +1645,7 @@ gen_portatom6(off, v) int off; bpf_int32 v; { - return gen_cmp(off_nl + 40 + off, BPF_H, v); + return gen_cmp_nl(40 + off, BPF_H, v); } #endif/*INET6*/ @@ -1482,7 +1656,7 @@ gen_portop(port, proto, dir) struct block *b0, *b1, *tmp; /* ip proto 'proto' */ - tmp = gen_cmp(off_nl + 9, BPF_B, (bpf_int32)proto); + tmp = gen_cmp_nl(9, BPF_B, (bpf_int32)proto); b0 = gen_ipfrag(); gen_and(tmp, b0); @@ -1554,7 +1728,7 @@ gen_portop6(port, proto, dir) struct block *b0, *b1, *tmp; /* ip proto 'proto' */ - b0 = gen_cmp(off_nl + 6, BPF_B, (bpf_int32)proto); + b0 = gen_cmp_nl(6, BPF_B, (bpf_int32)proto); switch (dir) { case Q_SRC: @@ -1667,6 +1841,11 @@ gen_protochain(v, proto, dir) memset(s, 0, sizeof(s)); fix2 = fix3 = fix4 = fix5 = 0; + if (variable_nl) { + bpf_error("'gen_protochain' not supported for variable DLTs"); + /*NOTREACHED*/ + } + switch (proto) { case Q_IP: case Q_IPV6: @@ -1966,7 +2145,7 @@ gen_proto(v, proto, dir) case Q_IP: b0 = gen_linktype(ETHERTYPE_IP); #ifndef CHASE_CHAIN - b1 = gen_cmp(off_nl + 9, BPF_B, (bpf_int32)v); + b1 = gen_cmp_nl(9, BPF_B, (bpf_int32)v); #else b1 = gen_protochain(v, Q_IP); #endif @@ -2040,7 +2219,7 @@ gen_proto(v, proto, dir) case Q_IPV6: b0 = gen_linktype(ETHERTYPE_IPV6); #ifndef CHASE_CHAIN - b1 = gen_cmp(off_nl + 6, BPF_B, (bpf_int32)v); + b1 = gen_cmp_nl(6, BPF_B, (bpf_int32)v); #else b1 = gen_protochain(v, Q_IPV6); #endif @@ -2560,9 +2739,16 @@ gen_load(proto, index, size) case Q_IPV6: #endif /* XXX Note that we assume a fixed link header here. */ - s = xfer_to_x(index); + if (variable_nl) { + s = nl2X_stmt(); + sappend(s, xfer_to_a(index)); + sappend(s, new_stmt(BPF_ALU|BPF_ADD|BPF_X)); + sappend(s, new_stmt(BPF_MISC|BPF_TAX)); + } else { + s = xfer_to_x(index); + } tmp = new_stmt(BPF_LD|BPF_IND|size); - tmp->s.k = off_nl; + tmp->s.k = off_nl; /* off_nl == 0 for variable_nl */ sappend(s, tmp); sappend(index->s, s); @@ -2578,13 +2764,12 @@ gen_load(proto, index, size) case Q_IGMP: case Q_IGRP: case Q_PIM: - s = new_stmt(BPF_LDX|BPF_MSH|BPF_B); - s->s.k = off_nl; + s = iphl_to_x(); sappend(s, xfer_to_a(index)); sappend(s, new_stmt(BPF_ALU|BPF_ADD|BPF_X)); sappend(s, new_stmt(BPF_MISC|BPF_TAX)); sappend(s, tmp = new_stmt(BPF_LD|BPF_IND|size)); - tmp->s.k = off_nl; + tmp->s.k = off_nl; /* off_nl is 0 if variable_nl */ sappend(index->s, s); gen_and(gen_proto_abbrev(proto), b = gen_ipfrag()); @@ -2874,8 +3059,8 @@ gen_broadcast(proto) case Q_IP: b0 = gen_linktype(ETHERTYPE_IP); hostmask = ~netmask; - b1 = gen_mcmp(off_nl + 16, BPF_W, (bpf_int32)0, hostmask); - b2 = gen_mcmp(off_nl + 16, BPF_W, + b1 = gen_mcmp_nl(16, BPF_W, (bpf_int32)0, hostmask); + b2 = gen_mcmp_nl(16, BPF_W, (bpf_int32)(~0 & hostmask), hostmask); gen_or(b1, b2); gen_and(b0, b2); @@ -2920,7 +3105,7 @@ gen_multicast(proto) case Q_IP: b0 = gen_linktype(ETHERTYPE_IP); - b1 = gen_cmp(off_nl + 16, BPF_B, (bpf_int32)224); + b1 = gen_cmp_nl(16, BPF_B, (bpf_int32)224); b1->s.code = JMP(BPF_JGE); gen_and(b0, b1); return b1; @@ -2928,7 +3113,7 @@ gen_multicast(proto) #ifdef INET6 case Q_IPV6: b0 = gen_linktype(ETHERTYPE_IPV6); - b1 = gen_cmp(off_nl + 24, BPF_B, (bpf_int32)255); + b1 = gen_cmp_nl(24, BPF_B, (bpf_int32)255); gen_and(b0, b1); return b1; #endif /* INET6 */ @@ -3163,6 +3348,11 @@ gen_vlan(vlan_num) int vlan_num; { struct block *b0; + + if (variable_nl) { + bpf_error("'vlan' not supported for variable DLTs"); + /*NOTREACHED*/ + } /* * Change the offsets to point to the type and data fields within