Hello David,
</snip>
> the scope of the pf locks likely needs reduction anyway. one of my
I agree. pf_lock covers too much in PF currently. It protects
all rules, tables and fragment caches.
> production firewalls panicked with the pf lock trying to lock against
> itself a couple of nights ago:
>
> db_enter() at db_enter+0x5
> panic(ffffffff81d47212) at panic+0x12a
> rw_enter(ffffffff82060e70,1) at rw_enter+0x261
> pf_test(18,2,ffff80000158f000,ffff800024d03db8) at pf_test+0x118c
> ip6_output(fffffd80049a3a00,0,0,0,ffff800024d03eb0,0) at
> ip6_output+0xd33
> nd6_ns_output(ffff80000158f000,0,ffff800024d04228,fffffd8279b3b420,0) at
> nd6_ns
> _output+0x3e2
> nd6_resolve(ffff80000158f000,fffffd816c6b2ae0,fffffd80659d8300,ffff800024d04220
> ,ffff800024d040d8) at nd6_resolve+0x29d
> ether_resolve(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2a
> e0,ffff800024d040d8) at ether_resolve+0x127
> ether_output(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2ae
> 0) at ether_output+0x2a
> ip6_output(fffffd80659d8300,0,0,0,0,0) at ip6_output+0x1180
> pfsync_undefer_notify(fffffd841dbec7b8) at pfsync_undefer_notify+0xac
> pfsync_undefer(fffffd841dbec7b8,0) at pfsync_undefer+0x8d
> pfsync_defer(fffffd82303cb310,fffffd8065a20000) at pfsync_defer+0xfe
> pf_test_rule(ffff800024d04600,ffff800024d045e8,ffff800024d045e0,ffff800024d045f
> 0,ffff800024d045d8,ffff800024d045fe) at pf_test_rule+0x693
> pf_test(2,3,ffff8000015a9000,ffff800024d04798) at pf_test+0x10f1
> ip_output(fffffd8065a20000,0,ffff800024d04850,1,0,0) at ip_output+0x829
> ip_forward(fffffd8065a20000,ffff800001541800,fffffd817a7722d0,0) at
> ip_forward+
> 0x27a
> ip_input_if(ffff800024d04a80,ffff800024d04a7c,4,0,ffff800001541800) at
> ip_input
> _if+0x5fd
> ipv4_input(ffff800001541800,fffffd8065a20000) at ipv4_input+0x37
> carp_input(ffff80000158f000,fffffd8065a20000,5e000158) at
> carp_input+0x1ac
> ether_input(ffff80000158f000,fffffd8065a20000) at ether_input+0x1c0
> vlan_input(ffff80000152f000,fffffd8065a20000) at vlan_input+0x19a
> ether_input(ffff80000152f000,fffffd8065a20000) at ether_input+0x76
> if_input_process(ffff8000001df048,ffff800024d04ca8) at
> if_input_process+0x5a
> ifiq_process(ffff8000001dbe00) at ifiq_process+0x6f
> taskq_thread(ffff80000002b080) at taskq_thread+0x7b
> end trace frame: 0x0, count: -26
>
The diff below postpones the call to pfsync_undefer() until the moment when
PF_LOCK() is released. I believe this should fix the panic you see. The change
does not look nice, and can be reverted later, once the pf_lock scope is reduced.
I took a closer look at pfsync_defer() here:
1941
1942 if (sc->sc_deferred >= 128) {
1943 mtx_enter(&sc->sc_deferrals_mtx);
1944 pd = TAILQ_FIRST(&sc->sc_deferrals);
1945 TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1946 sc->sc_deferred--;
1947 mtx_leave(&sc->sc_deferrals_mtx);
1948 if (timeout_del(&pd->pd_tmo))
1949 pfsync_undefer(pd, 0);
1950 }
1951
1952 pd = pool_get(&sc->sc_pool, M_NOWAIT);
1953 if (pd == NULL)
1954 return (0);
It seems to me we may leak the `pd` we took at line 1944. The leak
happens when timeout_del() returns zero.
The diff below ignores the return value from timeout_del() and makes
sure we always call pfsync_undefer().
I have not seen such a panic in my test environment. Would you be able
to give my diff a try?
thanks a lot
regards
sashan
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index b90aa934de4..a5e5e1ae0f3 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -272,7 +272,6 @@ void pfsync_syncdev_state(void *);
void pfsync_ifdetach(void *);
void pfsync_deferred(struct pf_state *, int);
-void pfsync_undefer(struct pfsync_deferral *, int);
void pfsync_defer_tmo(void *);
void pfsync_cancel_full_update(struct pfsync_softc *);
@@ -1927,7 +1926,7 @@ pfsync_insert_state(struct pf_state *st)
}
int
-pfsync_defer(struct pf_state *st, struct mbuf *m)
+pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
{
struct pfsync_softc *sc = pfsyncif;
struct pfsync_deferral *pd;
@@ -1939,19 +1938,29 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
m->m_flags & (M_BCAST|M_MCAST))
return (0);
+ pd = pool_get(&sc->sc_pool, M_NOWAIT);
+ if (pd == NULL)
+ return (0);
+
+ /*
+ * The deferral queue grows faster than the timeout can consume it,
+ * so we have to ask the packet (caller) to help the timer and dispatch
+ * one deferral for us.
+ *
+ * We wish to call pfsync_undefer() here. Unfortunately we can't,
+ * because pfsync_undefer() will be calling to ip_output(),
+ * which in turn will call to pf_test(), which would then attempt
+ * to grab PF_LOCK() we currently hold.
+ */
if (sc->sc_deferred >= 128) {
mtx_enter(&sc->sc_deferrals_mtx);
- pd = TAILQ_FIRST(&sc->sc_deferrals);
- TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ *ppd = TAILQ_FIRST(&sc->sc_deferrals);
+ TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
sc->sc_deferred--;
mtx_leave(&sc->sc_deferrals_mtx);
- if (timeout_del(&pd->pd_tmo))
- pfsync_undefer(pd, 0);
- }
-
- pd = pool_get(&sc->sc_pool, M_NOWAIT);
- if (pd == NULL)
- return (0);
+ timeout_del(&(*ppd)->pd_tmo);
+ } else
+ *ppd = NULL;
m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
SET(st->state_flags, PFSTATE_ACK);
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index 2520c87a36c..037219537e8 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -297,6 +297,8 @@ enum pfsync_counters {
extern struct cpumem *pfsynccounters;
+struct pfsync_deferral;
+
static inline void
pfsyncstat_inc(enum pfsync_counters c)
{
@@ -335,7 +337,9 @@ void pfsync_clear_states(u_int32_t,
const char *);
void pfsync_update_tdb(struct tdb *, int);
void pfsync_delete_tdb(struct tdb *);
-int pfsync_defer(struct pf_state *, struct mbuf *);
+int pfsync_defer(struct pf_state *, struct mbuf *,
+ struct pfsync_deferral **);
+void pfsync_undefer(struct pfsync_deferral *, int);
int pfsync_up(void);
int pfsync_state_in_use(struct pf_state *);
diff --git a/sys/net/pf.c b/sys/net/pf.c
index b684c4b037a..5fe5fcaad70 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -194,7 +194,8 @@ void pf_rule_to_actions(struct
pf_rule *,
struct pf_rule_actions *);
int pf_test_rule(struct pf_pdesc *, struct pf_rule **,
struct pf_state **, struct pf_rule **,
- struct pf_ruleset **, u_short *);
+ struct pf_ruleset **, u_short *,
+ void **);
static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_rule *,
struct pf_state_key **, struct pf_state_key **,
@@ -3733,7 +3734,8 @@ pf_match_rule(struct pf_test_ctx *ctx, struct pf_ruleset
*ruleset)
int
pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm,
- struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason)
+ struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason,
+ void **pdeferral)
{
struct pf_rule *r = NULL;
struct pf_rule *a = NULL;
@@ -3966,7 +3968,8 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm,
struct pf_state **sm,
* firewall has to know about it to allow
* replies through it.
*/
- if (pfsync_defer(*sm, pd->m))
+ if (pfsync_defer(*sm, pd->m,
+ (struct pfsync_deferral **)pdeferral))
return (PF_DEFER);
}
#endif /* NPFSYNC > 0 */
@@ -6815,6 +6818,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
u_int32_t qid, pqid = 0;
int have_pf_lock = 0;
+ void *deferral = NULL;
if (!pf_status.running)
return (PF_PASS);
@@ -6917,7 +6921,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
*/
PF_LOCK();
have_pf_lock = 1;
- action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason);
+ action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason,
+ &deferral);
s = pf_state_ref(s);
if (action != PF_PASS)
REASON_SET(&reason, PFRES_FRAG);
@@ -6949,7 +6954,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
break;
@@ -6981,7 +6986,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
break;
@@ -7057,7 +7062,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
@@ -7193,6 +7198,14 @@ done:
m_freem(pd.m);
/* FALLTHROUGH */
case PF_DEFER:
+#if NPFSYNC > 0
+ /*
+ * We no longer hold PF_LOCK() here, so we can dispatch
+ * deferral if we are asked to do so.
+ */
+ if (deferral != NULL)
+ pfsync_undefer(deferral, 0);
+#endif /* NPFSYNC > 0 */
pd.m = NULL;
action = PF_PASS;
break;