Hello, after reading emails from Philip Guenther and Mark Kettenis, doing some RTFM on locking in OpenBSD kernel I have a new patch. I call it as a smp-step-0.
Patch introduces a KERNEL_LOCK() to PF. Many dances with KERNEL_LOCK() happens in pf_test(). From future work point of view there are distinct parts as follows in pf_test(): - packet sanitization with reassembly. We grab and drop KERNEL_LOCK() pf_noralize_ip*() functions around pf_reassemble*(). Fragment cache will get its own rw-lock later. - as soon as packet is sanitized (or should say normalized), we do state check. State check will get its own rw-lock (I call it smp-step-1) - if no state is found PF proceeds to rule check, which will get its rw-lock too (smp-step-2). Basically KERNEL_LOCK() is placed everywhere, where individual rw-locks will be introduced later. As Philip Guenther has pointed out earlier in email, the user-threads must grab KERNEL_LOCK() before they will jump to rw-locks. The game is easy for PF in ioctl() subsystem, since everything there will be user thread. For packets (pf_test()) function the game is not that clear, since not all packets will be running in user-thread context. My assumption is the packets bound to loopback and local outbound packets are running in user-threads. All other packets (inbound and forwarded) are running in interrupt-threads. For this reason I'd like to introduce the extra code to pf_test(), which decides whether packet requires KERNEL_LOCK(), because it is running as a user thread. I'd like to get the patch in, before proceeding to next SMP step. thanks and regards sasha --------8<---------------8<---------------8<------------------8<-------- Index: net/pf.c =================================================================== RCS file: /cvs/src/sys/net/pf.c,v retrieving revision 1.937 diff -u -p -r1.937 pf.c --- net/pf.c 1 Sep 2015 19:12:25 -0000 1.937 +++ net/pf.c 4 Sep 2015 12:52:55 -0000 @@ -6316,6 +6316,7 @@ pf_test(sa_family_t af, int fwdir, struc union pf_headers pdhdrs; int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir; u_int32_t qid, pqid = 0; + int have_lock = 0; if (!pf_status.running) return (PF_PASS); @@ -6349,6 +6350,26 @@ pf_test(sa_family_t af, int fwdir, struc return (PF_PASS); } + /* + * Unlike kernel-threads, the user-threads require to have + * KENREL_LOCK() before they will touch rw-locks. + * + * The assumption here is fairly simple: + * - all inbound packets run as interrupt (kernel-thread), + * unless they are bound to loopback + * + * - outbound packets run as kernel-thread, if they + * are marked by PF_TAG_KERNEL flag. + * + * in all other cases we must assume packet runs in user-thread + * context and we should get KERNEL_LOCK(). + */ + if ((dir == PF_IN) && (ifp->if_type != IFT_LOOP)) { + (*m0)->m_pkthdr.pf.flags |= PF_TAG_KERNEL; + } else if (((*m0)->m_pkthdr.pf.flags & PF_TAG_KERNEL) == 0) { + PF_LOCK(have_lock); + } + action = pf_setup_pdesc(&pd, &pdhdrs, af, dir, kif, *m0, &reason); if (action != PF_PASS) { #if NPFLOG > 0 @@ -6399,6 +6420,7 @@ pf_test(sa_family_t af, int fwdir, struc * handle fragments that aren't reassembled by * normalization */ + PF_LOCK(have_lock); action = pf_test_rule(&pd, &r, &s, &a, &ruleset); if (action != PF_PASS) REASON_SET(&reason, PFRES_FRAG); @@ -6413,6 +6435,7 @@ pf_test(sa_family_t af, int fwdir, struc "dropping IPv6 packet with ICMPv4 payload"); goto done; } + PF_LOCK(have_lock); action = pf_test_state_icmp(&pd, &s, &reason); if (action == PF_PASS || action == PF_AFRT) { #if NPFSYNC > 0 @@ -6437,6 +6460,7 @@ pf_test(sa_family_t af, int fwdir, struc "dropping IPv4 packet with ICMPv6 payload"); goto done; } + PF_LOCK(have_lock); action = pf_test_state_icmp(&pd, &s, &reason); if (action == PF_PASS || action == PF_AFRT) { #if NPFSYNC > 0 @@ -6461,6 +6485,7 @@ pf_test(sa_family_t af, int fwdir, struc if (action == PF_DROP) goto done; } + PF_LOCK(have_lock); action = pf_test_state(&pd, &s, &reason); if (action == PF_PASS || action == PF_AFRT) { #if NPFSYNC > 0 @@ -6658,6 +6683,8 @@ done: else if (!s->if_index_out && dir == PF_OUT) s->if_index_out = ifp->if_index; } + + PF_UNLOCK(have_lock); return (action); } Index: net/pf_ioctl.c =================================================================== RCS file: /cvs/src/sys/net/pf_ioctl.c,v retrieving revision 1.289 diff -u -p -r1.289 pf_ioctl.c --- net/pf_ioctl.c 21 Jul 2015 02:32:04 -0000 1.289 +++ net/pf_ioctl.c 4 Sep 2015 12:52:57 -0000 @@ -906,6 +906,11 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a return (EACCES); } + /* + * Unlike kernel-thread, the user thread must obtain KERNEL_LOCK() + * before it touches rw-locks. + */ + KERNEL_LOCK(); if (flags & FWRITE) rw_enter_write(&pf_consistency_lock); else @@ -2310,6 +2315,8 @@ fail: rw_exit_write(&pf_consistency_lock); else rw_exit_read(&pf_consistency_lock); + KERNEL_UNLOCK(); + return (error); } Index: net/pf_norm.c =================================================================== RCS file: /cvs/src/sys/net/pf_norm.c,v retrieving revision 1.181 diff -u -p -r1.181 pf_norm.c --- net/pf_norm.c 19 Aug 2015 21:22:41 -0000 1.181 +++ net/pf_norm.c 4 Sep 2015 12:53:00 -0000 @@ -772,6 +772,7 @@ pf_normalize_ip(struct pf_pdesc *pd, u_s struct ip *h = mtod(pd->m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; u_int16_t mff = (ntohs(h->ip_off) & IP_MF); + int rv; if (!fragoff && !mff) goto no_fragment; @@ -794,7 +795,10 @@ pf_normalize_ip(struct pf_pdesc *pd, u_s return (PF_PASS); /* no reassembly */ /* Returns PF_DROP or m is NULL or completely reassembled mbuf */ - if (pf_reassemble(&pd->m, pd->dir, reason) != PF_PASS) + KERNEL_LOCK(); + rv = pf_reassemble(&pd->m, pd->dir, reason); + KERNEL_UNLOCK(); + if (rv != PF_PASS) return (PF_DROP); if (pd->m == NULL) return (PF_PASS); /* packet has been reassembled, no error */ @@ -814,6 +818,7 @@ int pf_normalize_ip6(struct pf_pdesc *pd, u_short *reason) { struct ip6_frag frag; + int rv; if (pd->fragoff == 0) goto no_fragment; @@ -826,8 +831,11 @@ pf_normalize_ip6(struct pf_pdesc *pd, u_ return (PF_PASS); /* no reassembly */ /* Returns PF_DROP or m is NULL or completely reassembled mbuf */ - if (pf_reassemble6(&pd->m, &frag, pd->fragoff + sizeof(frag), - pd->extoff, pd->dir, reason) != PF_PASS) + KERNEL_LOCK(); + rv = pf_reassemble6(&pd->m, &frag, pd->fragoff + sizeof(frag), + pd->extoff, pd->dir, reason); + KERNEL_UNLOCK(); + if (rv != PF_PASS); return (PF_DROP); if (pd->m == NULL) return (PF_PASS); /* packet has been reassembled, no error */ Index: net/pfvar.h =================================================================== RCS file: /cvs/src/sys/net/pfvar.h,v retrieving revision 1.420 diff -u -p -r1.420 pfvar.h --- net/pfvar.h 19 Aug 2015 21:22:41 -0000 1.420 +++ net/pfvar.h 4 Sep 2015 12:53:04 -0000 @@ -1909,5 +1909,18 @@ void pf_cksum(struct pf_pdesc *, stru #endif /* _KERNEL */ +#define PF_LOCK(_have_lock_) do { \ + if ((_have_lock_) == 0) { \ + KERNEL_LOCK(); \ + (_have_lock_) = 1; \ + } \ + } while (0) + +#define PF_UNLOCK(_have_lock_) do { \ + if ((_have_lock_) != 0) { \ + KERNEL_UNLOCK(); \ + (_have_lock_) = 0; \ + } \ + } while (0) #endif /* _NET_PFVAR_H_ */ Index: sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v retrieving revision 1.196 diff -u -p -r1.196 mbuf.h --- sys/mbuf.h 14 Aug 2015 05:25:29 -0000 1.196 +++ sys/mbuf.h 4 Sep 2015 12:53:05 -0000 @@ -106,6 +106,7 @@ struct pkthdr_pf { /* pkthdr_pf.flags */ #define PF_TAG_GENERATED 0x01 +#define PF_TAG_KERNEL 0x02 /* packet runs as kernel-thr */ #define PF_TAG_TRANSLATE_LOCALHOST 0x04 #define PF_TAG_DIVERTED 0x08 #define PF_TAG_DIVERTED_PACKET 0x10 @@ -115,7 +116,7 @@ struct pkthdr_pf { #ifdef _KERNEL #define MPF_BITS \ - ("\20\1GENERATED\3TRANSLATE_LOCALHOST\4DIVERTED\5DIVERTED_PACKET" \ + ("\20\1GENERATED\2KERNEL\3TRANSLATE_LOCALHOST\4DIVERTED\5DIVERTED_PACKET" \ "\6REROUTE\7REFRAGMENTED\10PROCESSED") #endif