On 3/09/2015, at 10:41 AM, Richard Procter wrote: > [...] or for that matter just unit-test the pf_change() calls. I > think I'll rustle up some of the later.
That was a useful exercise. The pf_change_*() code is fine. However, pf_cksum_fixup_a() reliably faulted a couple of times over 2E7 random address, so it was back to the drawing board. It turns out that 5.3's checksum modification algorithm is much hairer than it appears and I was fooling myself I understood it; the argument I provided it correct, it just doesn't establish the conclusion! The lesson is I guess not to re-purpose highly optimised code; instead refine something simple. I've trialed four alternatives on three architectures and will cover this in a later patch. In the meantime, unittest code can be found here[0]. The attached against HEAD is the same as my last but does not include pf_cksum_fixup_a() and adds some comments to pf_cksum_fixup(). 0) initialise pd->pcksum = icmp6 header cksum 3) re-introduce pf_cksum_fixup() (but not pf_cksum_fixup_a() 4) introduce pf_change_*() 22) misc. transitions to pf_change_*() interface I will be offline for a couple of days. best, Richard. [0] https://203.79.107.124/unittest/unittest.tar.gz Index: net/pf.c =================================================================== --- net.orig/pf.c +++ net/pf.c @@ -145,7 +145,8 @@ void pf_init_threshold(struct pf_thre u_int32_t); void pf_add_threshold(struct pf_threshold *); int pf_check_threshold(struct pf_threshold *); - +void pf_cksum_fixup(u_int16_t *, u_int16_t, u_int16_t, + u_int8_t); void pf_change_ap(struct pf_pdesc *, struct pf_addr *, u_int16_t *, struct pf_addr *, u_int16_t, sa_family_t); @@ -1651,6 +1652,105 @@ pf_addr_wrap_neq(struct pf_addr_wrap *aw } } +/* This function is deceptively simple, but is in fact an incredibly hairy + * optimised special case of an algorithm to emulates 16-bit ones-complement + * arithmetic by conserving its carries, which twos-complement otherwise + * discards, in the upper 16 bits of x. These accumulated carries when added + * to the lower 16-bits over a number of 'reductions' then complete the + * ones-complement sum. + * + * It uses a trick to eliminate a reduction step by emulating in + * twos-complement ones-complement subtraction for a single u_int16_t, + * thereby reducing the number of accumulated carries to at most one, and + * saving one each of +, >>, and ~. + * + * The emulation works as follows: subtracting exactly one u_int16_t from x + * incurs at most one net underflow, wrapping the accumulator to 2^16 - 1, + * which, when folded back into the 16-bit sum preserves the ones-complement + * borrow: + * + * ((x >> 16) + (x & 0xffff)) & 0xffff + * = { x mod 2^n = x & (2^n - 1) } + * ((x >> 16) + (x mod 2^16)) mod 2^16 + * = { def sum, def accumulator } + * (accumulator + sum) mod 2^16 + * = { one underflow, accumulator := 2^16 - 1 } + * ((2^16 - 1) + sum) mod 2^16 + * = { mod } + * (sum - 1) mod 2^16 + * + * The last step accounts for the reduction's trailing '& 0xffff' (which + * would, in general, destroy accumulated carries, but not here as a single + * add requires at most one reduction). + * + * And although this breaks for sum = 0, giving 0xffff, which is + * ones-complement's other zero, not -1, that cannot occur: underflowing the + * sum to zero requires subtracting at least 2^16, which exceeds a single + * u_int16_t. + */ +void +pf_cksum_fixup(u_int16_t *cksum, u_int16_t was, u_int16_t now, + u_int8_t proto) +{ + u_int32_t x; + const int udp = proto == IPPROTO_UDP; + + if (udp && *cksum == 0x0000) + return; + + x = *cksum + was - now; + x = ((x >> 16) + (x & 0xffff)) & 0xffff; + + if (udp && x == 0x0000) + x = 0xffff; + + *cksum = (u_int16_t)(x); +} + +void +pf_change_8(struct pf_pdesc *pd, u_int8_t *f, u_int8_t v, bool hi) +{ + u_int16_t new = htons(hi ? ( v << 8) : v); + u_int16_t old = htons(hi ? (*f << 8) : *f); + + pf_cksum_fixup(pd->pcksum, old, new, pd->proto); + *f = v; +} + +/* pre: *f is 16-bit aligned within its packet */ +void +pf_change_16(struct pf_pdesc *pd, u_int16_t *f, u_int16_t v) +{ + pf_cksum_fixup(pd->pcksum, *f, v, pd->proto); + *f = v; +} + +void +pf_change_16_unaligned(struct pf_pdesc *pd, void *f, u_int16_t v, bool hi) +{ + u_int8_t *fb = (u_int8_t*)f; + u_int8_t *vb = (u_int8_t*)&v; + + if (hi && ALIGNED_POINTER(f, u_int16_t)) { + pf_change_16(pd, f, v); /* optimise */ + return; + } + + pf_change_8(pd, fb++, *vb++, hi); + pf_change_8(pd, fb++, *vb++,!hi); +} + +/* pre: *f is 16-bit aligned within its packet */ +void +pf_change_32(struct pf_pdesc *pd, u_int32_t *f, u_int32_t v) +{ + u_int16_t *pc = pd->pcksum; + + pf_cksum_fixup(pc, *f / (1 << 16), v / (1 << 16), pd->proto); + pf_cksum_fixup(pc, *f % (1 << 16), v % (1 << 16), pd->proto); + *f = v; +} + void pf_change_ap(struct pf_pdesc *pd, struct pf_addr *a, u_int16_t *p, struct pf_addr *an, u_int16_t pn, sa_family_t naf) @@ -3724,11 +3824,8 @@ pf_translate(struct pf_pdesc *pd, struct u_int16_t icmpid = (icmp_dir == PF_IN) ? sport : dport; if (icmpid != pd->hdr.icmp->icmp_id) { - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, pd->off, - pd->tot_len - pd->off, pd->proto, - pd->af); - pd->hdr.icmp->icmp_id = icmpid; + pf_change_16(pd, + &pd->hdr.icmp->icmp_id, icmpid); rewrite = 1; } } @@ -3760,11 +3857,8 @@ pf_translate(struct pf_pdesc *pd, struct u_int16_t icmpid = (icmp_dir == PF_IN) ? sport : dport; if (icmpid != pd->hdr.icmp6->icmp6_id) { - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, pd->off, - pd->tot_len - pd->off, pd->proto, - pd->af); - pd->hdr.icmp6->icmp6_id = icmpid; + pf_change_16(pd, + &pd->hdr.icmp6->icmp6_id, icmpid); rewrite = 1; } } @@ -4572,11 +4666,8 @@ pf_test_state_icmp(struct pf_pdesc *pd, } if (nk->port[iidx] != pd->hdr.icmp->icmp_id) { - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, - pd->off, pd->tot_len - - pd->off, pd->proto, pd->af); - pd->hdr.icmp->icmp_id = nk->port[iidx]; + pf_change_16(pd, &pd->hdr.icmp->icmp_id, + nk->port[iidx]); } m_copyback(pd->m, pd->off, ICMP_MINLEN, @@ -4604,12 +4695,9 @@ pf_test_state_icmp(struct pf_pdesc *pd, } if (nk->port[iidx] != pd->hdr.icmp6->icmp6_id) { - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, - pd->off, pd->tot_len - - pd->off, pd->proto, pd->af); - pd->hdr.icmp6->icmp6_id = - nk->port[iidx]; + pf_change_16(pd, + &pd->hdr.icmp6->icmp6_id, + nk->port[iidx]); } m_copyback(pd->m, pd->off, @@ -6230,6 +6318,7 @@ pf_setup_pdesc(struct pf_pdesc *pd, void REASON_SET(reason, PFRES_SHORT); return (PF_DROP); } + pd->pcksum = &pd->hdr.icmp6->icmp6_cksum; break; } #endif /* INET6 */ Index: net/pfvar.h =================================================================== --- net.orig/pfvar.h +++ net/pfvar.h @@ -1719,6 +1719,13 @@ void pf_addr_inc(struct pf_addr *, sa_fa void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, sa_family_t); +#define PF_HI (true) +#define PF_LO (!PF_HI) +#define PF_ALGNMNT(off) (((off) % 2) == 0 ? PF_HI : PF_LO) +void pf_change_8(struct pf_pdesc *, u_int8_t *, u_int8_t, bool); +void pf_change_16(struct pf_pdesc *, u_int16_t *, u_int16_t); +void pf_change_16_unaligned(struct pf_pdesc *, void *, u_int16_t, bool); +void pf_change_32(struct pf_pdesc *, u_int32_t *, u_int32_t); void pf_change_a(struct pf_pdesc *, void *, u_int32_t); int pf_check_proto_cksum(struct pf_pdesc *, int, int, u_int8_t, sa_family_t); Index: net/pf_norm.c =================================================================== --- net.orig/pf_norm.c +++ net/pf_norm.c @@ -844,10 +844,6 @@ pf_normalize_tcp(struct pf_pdesc *pd) u_int8_t flags; u_int rewrite = 0; - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, pd->off, pd->tot_len - pd->off, - pd->proto, pd->af); - flags = th->th_flags; if (flags & TH_SYN) { /* Illegal packet */ @@ -869,15 +865,18 @@ pf_normalize_tcp(struct pf_pdesc *pd) } /* If flags changed, or reserved data set, then adjust */ - if (flags != th->th_flags || th->th_x2 != 0) { - th->th_flags = flags; - th->th_x2 = 0; - rewrite = 1; - } + if (flags != th->th_flags || th->th_x2 != 0) { + /* hack: set 4-bit th_x2 = 0 */ + u_int8_t *th_off = (u_int8_t*)(&th->th_ack+1); + pf_change_8(pd, th_off, th->th_off << 4, PF_HI); + + pf_change_8(pd, &th->th_flags, flags, PF_LO); + rewrite = 1; + } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { - th->th_urp = 0; + pf_change_16(pd, &th->th_urp, 0); rewrite = 1; } @@ -1381,12 +1380,8 @@ pf_normalize_mss(struct pf_pdesc *pd, u_ u_int16_t mss; int thoff; int opt, cnt, optlen = 0; - u_char opts[MAX_TCPOPTLEN]; - u_char *optp = opts; - - if (pd->csum_status == PF_CSUM_UNKNOWN) - pf_check_proto_cksum(pd, pd->off, pd->tot_len - pd->off, - pd->proto, pd->af); + u_int8_t opts[MAX_TCPOPTLEN]; + u_int8_t *optp = opts; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); @@ -1409,12 +1404,15 @@ pf_normalize_mss(struct pf_pdesc *pd, u_ break; } if (opt == TCPOPT_MAXSEG) { - memcpy(&mss, (optp + 2), 2); + u_int8_t *mssp = optp + 2; + memcpy(&mss, mssp, sizeof(mss)); if (ntohs(mss) > maxmss) { - mss = htons(maxmss); + size_t mssoffopts = mssp - opts; + pf_change_16_unaligned(pd, &mss, + htons(maxmss), PF_ALGNMNT(mssoffopts)); m_copyback(pd->m, - pd->off + sizeof(*th) + optp + 2 - opts, - 2, &mss, M_NOWAIT); + pd->off + sizeof(*th) + mssoffopts, + sizeof(mss), &mss, M_NOWAIT); pf_cksum(pd, pd->m); m_copyback(pd->m, pd->off, sizeof(*th), th, M_NOWAIT);