Hello,
re-sending the same diff, updated to the current if_pfsync.c.
The diff avoids a recursion into PF_LOCK(), which is indicated
by the panic stack trace here:
</snip>
> > production firewalls panicked with the pf lock trying to lock against
> > itself a couple of nights ago:
> >
> > db_enter() at db_enter+0x5
> > panic(ffffffff81d47212) at panic+0x12a
> > rw_enter(ffffffff82060e70,1) at rw_enter+0x261
> > pf_test(18,2,ffff80000158f000,ffff800024d03db8) at pf_test+0x118c
> > ip6_output(fffffd80049a3a00,0,0,0,ffff800024d03eb0,0) at
> > ip6_output+0xd33
> > nd6_ns_output(ffff80000158f000,0,ffff800024d04228,fffffd8279b3b420,0) at
> > nd6_ns
> > _output+0x3e2
> > nd6_resolve(ffff80000158f000,fffffd816c6b2ae0,fffffd80659d8300,ffff800024d04220
> > ,ffff800024d040d8) at nd6_resolve+0x29d
> > ether_resolve(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2a
> > e0,ffff800024d040d8) at ether_resolve+0x127
> > ether_output(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2ae
> > 0) at ether_output+0x2a
> > ip6_output(fffffd80659d8300,0,0,0,0,0) at ip6_output+0x1180
> > pfsync_undefer_notify(fffffd841dbec7b8) at pfsync_undefer_notify+0xac
> > pfsync_undefer(fffffd841dbec7b8,0) at pfsync_undefer+0x8d
> > pfsync_defer(fffffd82303cb310,fffffd8065a20000) at pfsync_defer+0xfe
> > pf_test_rule(ffff800024d04600,ffff800024d045e8,ffff800024d045e0,ffff800024d045f
> > 0,ffff800024d045d8,ffff800024d045fe) at pf_test_rule+0x693
> > pf_test(2,3,ffff8000015a9000,ffff800024d04798) at pf_test+0x10f1
> > ip_output(fffffd8065a20000,0,ffff800024d04850,1,0,0) at ip_output+0x829
> > ip_forward(fffffd8065a20000,ffff800001541800,fffffd817a7722d0,0) at
> > ip_forward+
> > 0x27a
> > ip_input_if(ffff800024d04a80,ffff800024d04a7c,4,0,ffff800001541800) at
> > ip_input
> > _if+0x5fd
> > ipv4_input(ffff800001541800,fffffd8065a20000) at ipv4_input+0x37
> > carp_input(ffff80000158f000,fffffd8065a20000,5e000158) at
> > carp_input+0x1ac
> > ether_input(ffff80000158f000,fffffd8065a20000) at ether_input+0x1c0
> > vlan_input(ffff80000152f000,fffffd8065a20000) at vlan_input+0x19a
> > ether_input(ffff80000152f000,fffffd8065a20000) at ether_input+0x76
> > if_input_process(ffff8000001df048,ffff800024d04ca8) at
> > if_input_process+0x5a
> > ifiq_process(ffff8000001dbe00) at ifiq_process+0x6f
> > taskq_thread(ffff80000002b080) at taskq_thread+0x7b
> > end trace frame: 0x0, count: -26
> >
>
The recursion into PF_LOCK() is avoided by postponing the call to
pfsync_undefer() to the very last phase of pf_test(), where we run
without holding PF_LOCK().
I believe we will be able to revert the diff below once the scope of
PF_LOCK() in pf_test() is reduced in the future. However, the future is not
here yet. To reduce the PF_LOCK() scope in pf_test() we must first give
some love to pf's tables.
comments? OKs?
thanks and
regards
sashan
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index f1d292d67ff..9e7dc5064b9 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -1931,7 +1931,7 @@ pfsync_insert_state(struct pf_state *st)
}
int
-pfsync_defer(struct pf_state *st, struct mbuf *m)
+pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
{
struct pfsync_softc *sc = pfsyncif;
struct pfsync_deferral *pd;
@@ -1944,21 +1944,30 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
m->m_flags & (M_BCAST|M_MCAST))
return (0);
+ pd = pool_get(&sc->sc_pool, M_NOWAIT);
+ if (pd == NULL)
+ return (0);
+
+ /*
+ * The deferral queue is growing faster than the timeout can
+ * consume it, so we have to ask the packet (i.e. the caller) to
+ * help the timer and dispatch one deferral for us.
+ *
+ * We wish to call pfsync_undefer() here. Unfortunately we can't,
+ * because pfsync_undefer() will be calling into ip_output(),
+ * which in turn will call into pf_test(), which would then attempt
+ * to grab the PF_LOCK() we currently hold.
+ */
if (sc->sc_deferred >= 128) {
mtx_enter(&sc->sc_deferrals_mtx);
- pd = TAILQ_FIRST(&sc->sc_deferrals);
- if (pd != NULL) {
- TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+ *ppd = TAILQ_FIRST(&sc->sc_deferrals);
+ if (*ppd != NULL) {
+ TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
sc->sc_deferred--;
}
mtx_leave(&sc->sc_deferrals_mtx);
- if (pd != NULL)
- pfsync_undefer(pd, 0);
- }
-
- pd = pool_get(&sc->sc_pool, M_NOWAIT);
- if (pd == NULL)
- return (0);
+ } else
+ *ppd = NULL;
m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
SET(st->state_flags, PFSTATE_ACK);
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index 2520c87a36c..037219537e8 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -297,6 +297,8 @@ enum pfsync_counters {
extern struct cpumem *pfsynccounters;
+struct pfsync_deferral;
+
static inline void
pfsyncstat_inc(enum pfsync_counters c)
{
@@ -335,7 +337,9 @@ void pfsync_clear_states(u_int32_t,
const char *);
void pfsync_update_tdb(struct tdb *, int);
void pfsync_delete_tdb(struct tdb *);
-int pfsync_defer(struct pf_state *, struct mbuf *);
+int pfsync_defer(struct pf_state *, struct mbuf *,
+ struct pfsync_deferral **);
+void pfsync_undefer(struct pfsync_deferral *, int);
int pfsync_up(void);
int pfsync_state_in_use(struct pf_state *);
diff --git a/sys/net/pf.c b/sys/net/pf.c
index b684c4b037a..5fe5fcaad70 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -194,7 +194,8 @@ void pf_rule_to_actions(struct
pf_rule *,
struct pf_rule_actions *);
int pf_test_rule(struct pf_pdesc *, struct pf_rule **,
struct pf_state **, struct pf_rule **,
- struct pf_ruleset **, u_short *);
+ struct pf_ruleset **, u_short *,
+ void **);
static __inline int pf_create_state(struct pf_pdesc *, struct pf_rule *,
struct pf_rule *, struct pf_rule *,
struct pf_state_key **, struct pf_state_key **,
@@ -3733,7 +3734,8 @@ pf_match_rule(struct pf_test_ctx *ctx, struct pf_ruleset
*ruleset)
int
pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm,
- struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason)
+ struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason,
+ void **pdeferral)
{
struct pf_rule *r = NULL;
struct pf_rule *a = NULL;
@@ -3966,7 +3968,8 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm,
struct pf_state **sm,
* firewall has to know about it to allow
* replies through it.
*/
- if (pfsync_defer(*sm, pd->m))
+ if (pfsync_defer(*sm, pd->m,
+ (struct pfsync_deferral **)pdeferral))
return (PF_DEFER);
}
#endif /* NPFSYNC > 0 */
@@ -6815,6 +6818,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
int dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
u_int32_t qid, pqid = 0;
int have_pf_lock = 0;
+ void *deferral = NULL;
if (!pf_status.running)
return (PF_PASS);
@@ -6917,7 +6921,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
*/
PF_LOCK();
have_pf_lock = 1;
- action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason);
+ action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason,
+ &deferral);
s = pf_state_ref(s);
if (action != PF_PASS)
REASON_SET(&reason, PFRES_FRAG);
@@ -6949,7 +6954,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
break;
@@ -6981,7 +6986,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
break;
@@ -7057,7 +7062,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp,
struct mbuf **m0)
PF_LOCK();
have_pf_lock = 1;
action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
- &reason);
+ &reason, &deferral);
s = pf_state_ref(s);
}
@@ -7193,6 +7198,14 @@ done:
m_freem(pd.m);
/* FALLTHROUGH */
case PF_DEFER:
+#if NPFSYNC > 0
+ /*
+ * We no longer hold PF_LOCK() here, so we can dispatch
+ * the deferral if we were asked to do so.
+ */
+ if (deferral != NULL)
+ pfsync_undefer(deferral, 0);
+#endif /* NPFSYNC > 0 */
pd.m = NULL;
action = PF_PASS;
break;