On Sat, Jun 05, 2021 at 01:09:01PM +0200, Alexandr Nedvedicky wrote: > Hello David, > > </snip> > > > the scope of the pf locks likely needs reduction anyway. one of my > > I agree. pf_lock covers too much in PF currently. it protects, > all rules, tables and fragment caches. > > > production firewalls panicked with the pf lock trying to lock against > > itself a couple of nights ago: > > > > db_enter() at db_enter+0x5 > > panic(ffffffff81d47212) at panic+0x12a > > rw_enter(ffffffff82060e70,1) at rw_enter+0x261 > > pf_test(18,2,ffff80000158f000,ffff800024d03db8) at pf_test+0x118c > > ip6_output(fffffd80049a3a00,0,0,0,ffff800024d03eb0,0) at > > ip6_output+0xd33 > > nd6_ns_output(ffff80000158f000,0,ffff800024d04228,fffffd8279b3b420,0) at > > nd6_ns > > _output+0x3e2 > > nd6_resolve(ffff80000158f000,fffffd816c6b2ae0,fffffd80659d8300,ffff800024d04220 > > ,ffff800024d040d8) at nd6_resolve+0x29d > > ether_resolve(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2a > > e0,ffff800024d040d8) at ether_resolve+0x127 > > ether_output(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2ae > > 0) at ether_output+0x2a > > ip6_output(fffffd80659d8300,0,0,0,0,0) at ip6_output+0x1180 > > pfsync_undefer_notify(fffffd841dbec7b8) at pfsync_undefer_notify+0xac > > pfsync_undefer(fffffd841dbec7b8,0) at pfsync_undefer+0x8d > > pfsync_defer(fffffd82303cb310,fffffd8065a20000) at pfsync_defer+0xfe > > pf_test_rule(ffff800024d04600,ffff800024d045e8,ffff800024d045e0,ffff800024d045f > > 0,ffff800024d045d8,ffff800024d045fe) at pf_test_rule+0x693 > > pf_test(2,3,ffff8000015a9000,ffff800024d04798) at pf_test+0x10f1 > > ip_output(fffffd8065a20000,0,ffff800024d04850,1,0,0) at ip_output+0x829 > > ip_forward(fffffd8065a20000,ffff800001541800,fffffd817a7722d0,0) at > > ip_forward+ > > 0x27a > > ip_input_if(ffff800024d04a80,ffff800024d04a7c,4,0,ffff800001541800) at > > ip_input > > _if+0x5fd > > ipv4_input(ffff800001541800,fffffd8065a20000) at ipv4_input+0x37 > > carp_input(ffff80000158f000,fffffd8065a20000,5e000158) at > > carp_input+0x1ac > > ether_input(ffff80000158f000,fffffd8065a20000) at ether_input+0x1c0 > > vlan_input(ffff80000152f000,fffffd8065a20000) at vlan_input+0x19a > > ether_input(ffff80000152f000,fffffd8065a20000) at ether_input+0x76 > > if_input_process(ffff8000001df048,ffff800024d04ca8) at > > if_input_process+0x5a > > ifiq_process(ffff8000001dbe00) at ifiq_process+0x6f > > taskq_thread(ffff80000002b080) at taskq_thread+0x7b > > end trace frame: 0x0, count: -26 > > > > diff below postpones call to pfsync_undefer() to moment, when PF_LOCK() is > released. I believe this should fix the panic you see. the change does not > look nice. and can be later reverted, when pf_lock scope will be reduced. > > I took a closer look at pfsync_defer() here: > > 1941 > 1942 if (sc->sc_deferred >= 128) { > 1943 mtx_enter(&sc->sc_deferrals_mtx); > 1944 pd = TAILQ_FIRST(&sc->sc_deferrals); > 1945 TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); > 1946 sc->sc_deferred--; > 1947 mtx_leave(&sc->sc_deferrals_mtx); > 1948 if (timeout_del(&pd->pd_tmo)) > 1949 pfsync_undefer(pd, 0); > 1950 } > 1951 > 1952 pd = pool_get(&sc->sc_pool, M_NOWAIT); > 1953 if (pd == NULL) > 1954 return (0); > > it seems to me we may leak `pd` we took at line 1944. The leak > happens in case timeout_del() return zero. > > diff below ignores the return value from timeout_del() and makes > sure we always call pfsync_undefefer() > > I have not seen such panic in my test environment. Would you be able > to give my diff a try?
i hit a different fault in this code recently, so i've been looking at the code again. my fix for the second fault will conflict with this one. i'll send it out to tech@ separately though. your diff looks like it moves the handling of undefer until after pf drops its locks. i was just going to make pfsync schedule a task or softint immediately if the queue was getting long, and then stop doing deferrals if it was way too long. > --------8<---------------8<---------------8<------------------8<-------- > diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c > index b90aa934de4..a5e5e1ae0f3 100644 > --- a/sys/net/if_pfsync.c > +++ b/sys/net/if_pfsync.c > @@ -272,7 +272,6 @@ void pfsync_syncdev_state(void *); > void pfsync_ifdetach(void *); > > void pfsync_deferred(struct pf_state *, int); > -void pfsync_undefer(struct pfsync_deferral *, int); > void pfsync_defer_tmo(void *); > > void pfsync_cancel_full_update(struct pfsync_softc *); > @@ -1927,7 +1926,7 @@ pfsync_insert_state(struct pf_state *st) > } > > int > -pfsync_defer(struct pf_state *st, struct mbuf *m) > +pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral > **ppd) > { > struct pfsync_softc *sc = pfsyncif; > struct pfsync_deferral *pd; > @@ -1939,19 +1938,29 @@ pfsync_defer(struct pf_state *st, struct mbuf *m) > m->m_flags & (M_BCAST|M_MCAST)) > return (0); > > + pd = pool_get(&sc->sc_pool, M_NOWAIT); > + if (pd == NULL) > + return (0); > + > + /* > + * deferral queue grows faster, than timeout can consume, > + * we have to ask packet (caller) to help timer and dispatch > + * one deferral for us. > + * > + * We wish to call pfsync_undefer() here. Unfortunately we can't, > + * because pfsync_undefer() will be calling to ip_output(), > + * which in turn will call to pf_test(), which would then attempt > + * to grab PF_LOCK() we currently hold. > + */ > if (sc->sc_deferred >= 128) { > mtx_enter(&sc->sc_deferrals_mtx); > - pd = TAILQ_FIRST(&sc->sc_deferrals); > - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); > + *ppd = TAILQ_FIRST(&sc->sc_deferrals); > + TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry); > sc->sc_deferred--; > mtx_leave(&sc->sc_deferrals_mtx); > - if (timeout_del(&pd->pd_tmo)) > - pfsync_undefer(pd, 0); > - } > - > - pd = pool_get(&sc->sc_pool, M_NOWAIT); > - if (pd == NULL) > - return (0); > + timeout_del(&pd->pd_tmo); > + } else > + *ppd = NULL; > > m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; > SET(st->state_flags, PFSTATE_ACK); > diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h > index 2520c87a36c..037219537e8 100644 > --- a/sys/net/if_pfsync.h > +++ b/sys/net/if_pfsync.h > @@ -297,6 +297,8 @@ enum pfsync_counters { > > extern struct cpumem *pfsynccounters; > > +struct pfsync_deferral; > + > static inline void > pfsyncstat_inc(enum pfsync_counters c) > { > @@ -335,7 +337,9 @@ void pfsync_clear_states(u_int32_t, > const char *); > void pfsync_update_tdb(struct tdb *, int); > void pfsync_delete_tdb(struct tdb *); > > -int pfsync_defer(struct pf_state *, struct mbuf *); > +int pfsync_defer(struct pf_state *, struct mbuf *, > + struct pfsync_deferral **); > +void pfsync_undefer(struct pfsync_deferral *, int); > > int pfsync_up(void); > int pfsync_state_in_use(struct pf_state *); > diff --git a/sys/net/pf.c b/sys/net/pf.c > index b684c4b037a..5fe5fcaad70 100644 > --- a/sys/net/pf.c > +++ b/sys/net/pf.c > @@ -194,7 +194,8 @@ void pf_rule_to_actions(struct > pf_rule *, > struct pf_rule_actions *); > int pf_test_rule(struct pf_pdesc *, struct pf_rule **, > struct pf_state **, struct pf_rule **, > - struct pf_ruleset **, u_short *); > + struct pf_ruleset **, u_short *, > + void **); > static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *, > struct pf_rule *, struct pf_rule *, > struct pf_state_key **, struct pf_state_key **, > @@ -3733,7 +3734,8 @@ pf_match_rule(struct pf_test_ctx *ctx, struct > pf_ruleset *ruleset) > > int > pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm, > - struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason) > + struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason, > + void **pdeferral) > { > struct pf_rule *r = NULL; > struct pf_rule *a = NULL; > @@ -3966,7 +3968,8 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, > struct pf_state **sm, > * firewall has to know about it to allow > * replies through it. > */ > - if (pfsync_defer(*sm, pd->m)) > + if (pfsync_defer(*sm, pd->m, > + (struct pfsync_deferral **)pdeferral)) > return (PF_DEFER); > } > #endif /* NPFSYNC > 0 */ > @@ -6815,6 +6818,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, > struct mbuf **m0) > int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir; > u_int32_t qid, pqid = 0; > int have_pf_lock = 0; > + void *deferral = NULL; > > if (!pf_status.running) > return (PF_PASS); > @@ -6917,7 +6921,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, > struct mbuf **m0) > */ > PF_LOCK(); > have_pf_lock = 1; > - action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason); > + action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason, > + &deferral); > s = pf_state_ref(s); > if (action != PF_PASS) > REASON_SET(&reason, PFRES_FRAG); > @@ -6949,7 +6954,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, > struct mbuf **m0) > PF_LOCK(); > have_pf_lock = 1; > action = pf_test_rule(&pd, &r, &s, &a, &ruleset, > - &reason); > + &reason, &deferral); > s = pf_state_ref(s); > } > break; > @@ -6981,7 +6986,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, > struct mbuf **m0) > PF_LOCK(); > have_pf_lock = 1; > action = pf_test_rule(&pd, &r, &s, &a, &ruleset, > - &reason); > + &reason, &deferral); > s = pf_state_ref(s); > } > break; > @@ -7057,7 +7062,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, > struct mbuf **m0) > PF_LOCK(); > have_pf_lock = 1; > action = pf_test_rule(&pd, &r, &s, &a, &ruleset, > - &reason); > + &reason, &deferral); > s = pf_state_ref(s); > } > > @@ -7193,6 +7198,14 @@ done: > m_freem(pd.m); > /* FALLTHROUGH */ > case PF_DEFER: > +#if NPFSYNC > 0 > + /* > + * We no longer hold PF_LOCK() here, so we can dispatch > + * deferral if we are asked to do so. > + */ > + if (deferral != NULL) > + pfsync_undefer(deferral, 0); > +#endif /* NPFSYNC > 0 */ > pd.m = NULL; > action = PF_PASS; > break;