Hello,

re-sending the same diff updated to current if_pfsync.c.

diff avoids a recursion to PF_LOCK(), which is indicated
by panic stack here:

</snip>
> > production firewalls panicked with the pf lock trying to lock against
> > itself a couple of nights ago:
> > 
> > db_enter() at db_enter+0x5
> > panic(ffffffff81d47212) at panic+0x12a
> > rw_enter(ffffffff82060e70,1) at rw_enter+0x261
> > pf_test(18,2,ffff80000158f000,ffff800024d03db8) at pf_test+0x118c
> > ip6_output(fffffd80049a3a00,0,0,0,ffff800024d03eb0,0) at
> > ip6_output+0xd33
> > nd6_ns_output(ffff80000158f000,0,ffff800024d04228,fffffd8279b3b420,0) at
> > nd6_ns
> > _output+0x3e2
> > nd6_resolve(ffff80000158f000,fffffd816c6b2ae0,fffffd80659d8300,ffff800024d04220
> > ,ffff800024d040d8) at nd6_resolve+0x29d
> > ether_resolve(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2a
> > e0,ffff800024d040d8) at ether_resolve+0x127
> > ether_output(ffff80000158f000,fffffd80659d8300,ffff800024d04220,fffffd816c6b2ae
> > 0) at ether_output+0x2a
> > ip6_output(fffffd80659d8300,0,0,0,0,0) at ip6_output+0x1180
> > pfsync_undefer_notify(fffffd841dbec7b8) at pfsync_undefer_notify+0xac
> > pfsync_undefer(fffffd841dbec7b8,0) at pfsync_undefer+0x8d
> > pfsync_defer(fffffd82303cb310,fffffd8065a20000) at pfsync_defer+0xfe
> > pf_test_rule(ffff800024d04600,ffff800024d045e8,ffff800024d045e0,ffff800024d045f
> > 0,ffff800024d045d8,ffff800024d045fe) at pf_test_rule+0x693
> > pf_test(2,3,ffff8000015a9000,ffff800024d04798) at pf_test+0x10f1
> > ip_output(fffffd8065a20000,0,ffff800024d04850,1,0,0) at ip_output+0x829
> > ip_forward(fffffd8065a20000,ffff800001541800,fffffd817a7722d0,0) at
> > ip_forward+
> > 0x27a      
> > ip_input_if(ffff800024d04a80,ffff800024d04a7c,4,0,ffff800001541800) at
> > ip_input
> > _if+0x5fd
> > ipv4_input(ffff800001541800,fffffd8065a20000) at ipv4_input+0x37
> > carp_input(ffff80000158f000,fffffd8065a20000,5e000158) at
> > carp_input+0x1ac
> > ether_input(ffff80000158f000,fffffd8065a20000) at ether_input+0x1c0
> > vlan_input(ffff80000152f000,fffffd8065a20000) at vlan_input+0x19a
> > ether_input(ffff80000152f000,fffffd8065a20000) at ether_input+0x76
> > if_input_process(ffff8000001df048,ffff800024d04ca8) at
> > if_input_process+0x5a
> > ifiq_process(ffff8000001dbe00) at ifiq_process+0x6f
> > taskq_thread(ffff80000002b080) at taskq_thread+0x7b
> > end trace frame: 0x0, count: -26
> > 
> 

the recursion to PF_LOCK() is avoided by postponing the call to
pfsync_undefer() to very last phase of pf_test(), where we run
without holding PF_LOCK().

I believe we will be able to revert diff below once scope of PF_LOCK() in
pf_test() will be reduced in future. However future is not here yet. To reduce
PF_LOCK() scope in pf_test() we must give some love to pf's tables first.

comments? OKs?

thanks and
regards
sashan

--------8<---------------8<---------------8<------------------8<--------

diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index f1d292d67ff..9e7dc5064b9 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -1931,7 +1931,7 @@ pfsync_insert_state(struct pf_state *st)
 }
 
 int
-pfsync_defer(struct pf_state *st, struct mbuf *m)
+pfsync_defer(struct pf_state *st, struct mbuf *m, struct pfsync_deferral **ppd)
 {
        struct pfsync_softc *sc = pfsyncif;
        struct pfsync_deferral *pd;
@@ -1944,21 +1944,30 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
            m->m_flags & (M_BCAST|M_MCAST))
                return (0);
 
+       pd = pool_get(&sc->sc_pool, M_NOWAIT);
+       if (pd == NULL)
+               return (0);
+
+       /*
+        * deferral queue grows faster, than timeout can consume,
+        * we have to ask packet (caller) to help timer and dispatch
+        * one deferral for us.
+        *
+        * We wish to call pfsync_undefer() here. Unfortunately we can't,
+        * because pfsync_undefer() will be calling to ip_output(),
+        * which in turn will call to pf_test(), which would then attempt
+        * to grab PF_LOCK() we currently hold.
+        */
        if (sc->sc_deferred >= 128) {
                mtx_enter(&sc->sc_deferrals_mtx);
-               pd = TAILQ_FIRST(&sc->sc_deferrals);
-               if (pd != NULL) {
-                       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+               *ppd = TAILQ_FIRST(&sc->sc_deferrals);
+               if (*ppd != NULL) {
+                       TAILQ_REMOVE(&sc->sc_deferrals, *ppd, pd_entry);
                        sc->sc_deferred--;
                }
                mtx_leave(&sc->sc_deferrals_mtx);
-               if (pd != NULL)
-                       pfsync_undefer(pd, 0);
-       }
-
-       pd = pool_get(&sc->sc_pool, M_NOWAIT);
-       if (pd == NULL)
-               return (0);
+       } else
+               *ppd = NULL;
 
        m->m_pkthdr.pf.flags |= PF_TAG_GENERATED;
        SET(st->state_flags, PFSTATE_ACK);
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index 2520c87a36c..037219537e8 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -297,6 +297,8 @@ enum pfsync_counters {
 
 extern struct cpumem *pfsynccounters;
 
+struct pfsync_deferral;
+
 static inline void
 pfsyncstat_inc(enum pfsync_counters c)
 {
@@ -335,7 +337,9 @@ void                        pfsync_clear_states(u_int32_t, 
const char *);
 void                   pfsync_update_tdb(struct tdb *, int);
 void                   pfsync_delete_tdb(struct tdb *);
 
-int                    pfsync_defer(struct pf_state *, struct mbuf *);
+int                    pfsync_defer(struct pf_state *, struct mbuf *,
+                           struct pfsync_deferral **);
+void                   pfsync_undefer(struct pfsync_deferral *, int);
 
 int                    pfsync_up(void);
 int                    pfsync_state_in_use(struct pf_state *);
diff --git a/sys/net/pf.c b/sys/net/pf.c
index b684c4b037a..5fe5fcaad70 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -194,7 +194,8 @@ void                         pf_rule_to_actions(struct 
pf_rule *,
                            struct pf_rule_actions *);
 int                     pf_test_rule(struct pf_pdesc *, struct pf_rule **,
                            struct pf_state **, struct pf_rule **,
-                           struct pf_ruleset **, u_short *);
+                           struct pf_ruleset **, u_short *,
+                           void **);
 static __inline int     pf_create_state(struct pf_pdesc *, struct pf_rule *,
                            struct pf_rule *, struct pf_rule *,
                            struct pf_state_key **, struct pf_state_key **,
@@ -3733,7 +3734,8 @@ pf_match_rule(struct pf_test_ctx *ctx, struct pf_ruleset 
*ruleset)
 
 int
 pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, struct pf_state **sm,
-    struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason)
+    struct pf_rule **am, struct pf_ruleset **rsm, u_short *reason,
+    void **pdeferral)
 {
        struct pf_rule          *r = NULL;
        struct pf_rule          *a = NULL;
@@ -3966,7 +3968,8 @@ pf_test_rule(struct pf_pdesc *pd, struct pf_rule **rm, 
struct pf_state **sm,
                 * firewall has to know about it to allow
                 * replies through it.
                 */
-               if (pfsync_defer(*sm, pd->m))
+               if (pfsync_defer(*sm, pd->m,
+                   (struct pfsync_deferral **)pdeferral))
                        return (PF_DEFER);
        }
 #endif /* NPFSYNC > 0 */
@@ -6815,6 +6818,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, 
struct mbuf **m0)
        int                      dir = (fwdir == PF_FWD) ? PF_OUT : fwdir;
        u_int32_t                qid, pqid = 0;
        int                      have_pf_lock = 0;
+       void                    *deferral = NULL;
 
        if (!pf_status.running)
                return (PF_PASS);
@@ -6917,7 +6921,8 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, 
struct mbuf **m0)
                 */
                PF_LOCK();
                have_pf_lock = 1;
-               action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason);
+               action = pf_test_rule(&pd, &r, &s, &a, &ruleset, &reason,
+                   &deferral);
                s = pf_state_ref(s);
                if (action != PF_PASS)
                        REASON_SET(&reason, PFRES_FRAG);
@@ -6949,7 +6954,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, 
struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
-                           &reason);
+                           &reason, &deferral);
                        s = pf_state_ref(s);
                }
                break;
@@ -6981,7 +6986,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, 
struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
-                           &reason);
+                           &reason, &deferral);
                        s = pf_state_ref(s);
                }
                break;
@@ -7057,7 +7062,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, 
struct mbuf **m0)
                        PF_LOCK();
                        have_pf_lock = 1;
                        action = pf_test_rule(&pd, &r, &s, &a, &ruleset,
-                           &reason);
+                           &reason, &deferral);
                        s = pf_state_ref(s);
                }
 
@@ -7193,6 +7198,14 @@ done:
                m_freem(pd.m);
                /* FALLTHROUGH */
        case PF_DEFER:
+#if NPFSYNC > 0
+               /*
+                * We no longer hold PF_LOCK() here, so we can dispatch
+                * deferral if we are asked to do so.
+                */
+               if (deferral != NULL)
+                       pfsync_undefer(deferral, 0);
+#endif /* NPFSYNC > 0 */
                pd.m = NULL;
                action = PF_PASS;
                break;

Reply via email to