Hello David, 
> the scope of the pf locks likely needs reduction anyway. I agree. pf_lock covers too much in PF currently. It protects all rules, tables and fragment caches. > one of my > production firewalls panicked with the pf lock trying to lock against > itself a couple of nights ago: > > db_enter() at db_enter+0x5 > panic(ffffffff81d47212) at panic+0x12a > rw_enter(ffffffff82060e70,1) at rw_enter+0x261 > pf_test(18,2,ffff80000158f000,ffff800024d03db8) at pf_test+0x118c > ip6_output(fffffd80049a3a00,0,0,0,ffff800024d03eb0,0) at > ip6_output+0xd33 > nd6_ns_output(ffff80000158f000,0,ffff800024d04228,fffffd8279b3b420,0) at > nd6_ns > _output+0x3e2 > nd6_resolve(ffff80000158f000,fffffd816c6b2ae0,fffffd80659d8300,ffff800024d04220 > ,ffff800024d040d8) at nd6_resolve+0x29d > ether_resolve(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2a > e0,ffff800024d040d8) at ether_resolve+0x127 > ether_output(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2ae > 0) at ether_output+0x2a > ip6_output(fffffd80659d8300,0,0,0,0,0) at ip6_output+0x1180 > pfsync_undefer_notify(fffffd841dbec7b8) at pfsync_undefer_notify+0xac > pfsync_undefer(fffffd841dbec7b8,0) at pfsync_undefer+0x8d > pfsync_defer(fffffd82303cb310,fffffd8065a20000) at pfsync_defer+0xfe > pf_test_rule(ffff800024d04600,ffff800024d045e8,ffff800024d045e0,ffff800024d045f > 0,ffff800024d045d8,ffff800024d045fe) at pf_test_rule+0x693 > pf_test(2,3,ffff8000015a9000,ffff800024d04798) at pf_test+0x10f1 > ip_output(fffffd8065a20000,0,ffff800024d04850,1,0,0) at ip_output+0x829 > ip_forward(fffffd8065a20000,ffff800001541800,fffffd817a7722d0,0) at > ip_forward+ > 0x27a > ip_input_if(ffff800024d04a80,ffff800024d04a7c,4,0,ffff800001541800) at > ip_input > _if+0x5fd > ipv4_input(ffff800001541800,fffffd8065a20000) at ipv4_input+0x37 > carp_input(ffff80000158f000,fffffd8065a20000,5e000158) at > carp_input+0x1ac > ether_input(ffff80000158f000,fffffd8065a20000) at ether_input+0x1c0 > 
vlan_input(ffff80000152f000,fffffd8065a20000) at vlan_input+0x19a > ether_input(ffff80000152f000,fffffd8065a20000) at ether_input+0x76 > if_input_process(ffff8000001df048,ffff800024d04ca8) at > if_input_process+0x5a > ifiq_process(ffff8000001dbe00) at ifiq_process+0x6f > taskq_thread(ffff80000002b080) at taskq_thread+0x7b > end trace frame: 0x0, count: -26 > The diff below postpones the call to pfsync_undefer() to the moment when PF_LOCK() is released. I believe this should fix the panic you see. The change does not look nice and can be reverted later, when the pf_lock scope is reduced. I took a closer look at pfsync_defer() here: 1941 1942 if (sc->sc_deferred >= 128) { 1943 mtx_enter(&sc->sc_deferrals_mtx); 1944 pd = TAILQ_FIRST(&sc->sc_deferrals); 1945 TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); 1946 sc->sc_deferred--; 1947 mtx_leave(&sc->sc_deferrals_mtx); 1948 if (timeout_del(&pd->pd_tmo)) 1949 pfsync_undefer(pd, 0); 1950 } 1951 1952 pd = pool_get(&sc->sc_pool, M_NOWAIT); 1953 if (pd == NULL) 1954 return (0); It seems to me we may leak the `pd` we took at line 1944. The leak happens in case timeout_del() returns zero. The diff below ignores the return value from timeout_del() and makes sure we always call pfsync_undefer(). I have not seen such a panic in my test environment. Would you be able to give my diff a try? 
thanks a lot regards sashan --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index b90aa934de4..a5e5e1ae0f3 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -272,7 +272,6 @@ void pfsync_syncdev_state(void *); void pfsync_ifdetach(void *); void pfsync_deferred(struct pf_state *, int); -void pfsync_undefer(struct pfsync_deferral *, int); void pfsync_defer_tmo(void *); void pfsync_cancel_full_update(struct pfsync_softc *); @@ -1927,7 +1926,7 @@ pfsync_insert_state(struct pf_state *st) } int -pfsync_defer(struct pf_state *st, struct mbuf *m) +pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd) { struct pfsync_softc *sc = pfsyncif; struct pfsync_deferral *pd; @@ -1939,19 +1938,29 @@ pfsync_defer(struct pf_state *st, struct mbuf *m) m->m_flags & (M_BCAST|M_MCAST)) return (0); + pd = pool_get(&sc->sc_pool, M_NOWAIT); + if (pd == NULL) + return (0); + + /* + * deferral queue grows faster, than timeout can consume, + * we have to ask packet (caller) to help timer and dispatch + * one deferral for us. + * + * We wish to call pfsync_undefer() here. Unfortunately we can't, + * because pfsync_undefer() will be calling to ip_output(), + * which in turn will call to pf_test(), which would then attempt + * to grab PF_LOCK() we currently hold. 
+ */ if (sc->sc_deferred >= 128) { mtx_enter(&sc->sc_deferrals_mtx); - pd = TAILQ_FIRST(&sc->sc_deferrals); - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + *ppd = TAILQ_FIRST(&sc->sc_deferrals); + TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry); sc->sc_deferred--; mtx_leave(&sc->sc_deferrals_mtx); - if (timeout_del(&pd->pd_tmo)) - pfsync_undefer(pd, 0); - } - - pd = pool_get(&sc->sc_pool, M_NOWAIT); - if (pd == NULL) - return (0); + timeout_del(&(*ppd)->pd_tmo); + } else + *ppd = NULL; m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; SET(st->state_flags, PFSTATE_ACK); diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index 2520c87a36c..037219537e8 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -297,6 +297,8 @@ enum pfsync_counters { extern struct cpumem *pfsynccounters; +struct pfsync_deferral; + static inline void pfsyncstat_inc(enum pfsync_counters c) { @@ -335,7 +337,9 @@ void pfsync_clear_states(u_int32_t, const char *); void pfsync_update_tdb(struct tdb *, int); void pfsync_delete_tdb(struct tdb *); -int pfsync_defer(struct pf_state *, struct mbuf *); +int pfsync_defer(struct pf_state *, struct mbuf *, + struct pfsync_deferral **); +void pfsync_undefer(struct pfsync_deferral *, int); int pfsync_up(void); int pfsync_state_in_use(struct pf_state *); diff --git a/sys/net/pf.c b/sys/net/pf.c index b684c4b037a..5fe5fcaad70 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -194,7 +194,8 @@ void pf_rule_to_actions(struct pf_rule *, struct pf_rule_actions *); int pf_test_rule(struct pf_pdesc *, struct pf_rule **, struct pf_state **, struct pf_rule **, - struct pf_ruleset **, u_short *); + struct pf_ruleset **, u_short *, + void **); static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *, struct pf_rule *, struct pf_rule *, struct pf_state_key **, struct pf_state_key **, @@ -3733,7 +3734,8 @@ pf_match_rule(struct pf_test_ctx *ctx, struct pf_ruleset *ruleset) int pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, -
struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason) + struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason, + void **pdeferral) { struct pf_rule *r = NULL; struct pf_rule *a = NULL; @@ -3966,7 +3968,8 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, * firewall has to know about it to allow * replies through it. */ - if (pfsync_defer(*sm, pd->m)) + if (pfsync_defer(*sm, pd->m, + (struct pfsync_deferral **)pdeferral)) return (PF_DEFER); } #endif /* NPFSYNC > 0 */ @@ -6815,6 +6818,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir; u_int32_t qid, pqid = 0; int have_pf_lock = 0; + void *deferral = NULL; if (!pf_status.running) return (PF_PASS); @@ -6917,7 +6921,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) */ PF_LOCK(); have_pf_lock = 1; - action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason); + action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason, + &deferral); s = pf_state_ref(s); if (action != PF_PASS) REASON_SET(&reason, PFRES_FRAG); @@ -6949,7 +6954,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &s, &a, &ruleset, - &reason); + &reason, &deferral); s = pf_state_ref(s); } break; @@ -6981,7 +6986,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &s, &a, &ruleset, - &reason); + &reason, &deferral); s = pf_state_ref(s); } break; @@ -7057,7 +7062,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_LOCK(); have_pf_lock = 1; action = pf_test_rule(&pd, &r, &s, &a, &ruleset, - &reason); + &reason, &deferral); s = pf_state_ref(s); } @@ -7193,6 +7198,14 @@ done: m_freem(pd.m); /* FALLTHROUGH */ case PF_DEFER: +#if NPFSYNC > 0 + /* + * We no longer hold PF_LOCK() here, so we can dispatch + * deferral if we are asked 
to do so. + */ + if (deferral != NULL) + pfsync_undefer(deferral, 0); +#endif /* NPFSYNC > 0 */ pd.m = NULL; action = PF_PASS; break;