Hello,

below is a diff which makes pfsync(4) more pf_lock friendly. The diff has been
sitting in my tree for some time. Hrvoje has done some preliminary testing
already. He could trigger some sparks and smoke, which I was able to put out.
However, don't get too excited about the change yet. At this phase I really
need some testing focused on spotting any undesired side effects.

There should be no change in pf(4) + pfsync(4) behaviour in terms of
functionality or stability if the diff is applied as-is. All SMP-related
changes in the pf/pfsync code are protected by a WITH_PF_LOCK guard. I think
it's still worth giving the diff below a try _without_ WITH_PF_LOCK being set,
watching for changes in pfsync behavior.

Brave hearts can compile a kernel with -DWITH_PF_LOCK. For example, one can
do something like this:
        cd sys/arch/`uname -p`/conf
        echo 'option WITH_PF_LOCK' >> GENERIC.MP
        config GENERIC.MP
        cd ../compile/GENERIC.MP/
        make clean
        make
A kernel compiled with the -DWITH_PF_LOCK option will execute the new
pf_lock-friendly code in pfsync. But don't expect any performance improvement
yet. Remember there is no concurrency yet that would run more instances of
the pf_test() function.

If testing doesn't reveal any problems, I'll re-post the diff asking for OKs.

The diff just extends the current pfsync implementation so the
pfsync_update_state() function can run concurrently.

Every state queue maintained by pfsync gets its own mutex to synchronize
insert/remove operations.
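
Condensed, the idea looks like this (just a sketch; the real insert path in
the diff below also does the sc_len accounting, the MTU check, and resolves
two threads racing to insert the same state):

        struct pfsync_softc {
                /* ... */
                struct pf_state_queue    sc_qs[PFSYNC_S_COUNT];
                struct mutex             sc_mtx[PFSYNC_S_COUNT]; /* one per queue */
                /* ... */
        };

        void
        pfsync_q_ins(struct pf_state *st, int q)
        {
                struct pfsync_softc *sc = pfsyncif;

                /* only queue q is locked; the other queues stay available */
                mtx_enter(&sc->sc_mtx[q]);
                pf_state_ref(st);
                TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
                st->sync_state = q;
                mtx_leave(&sc->sc_mtx[q]);
        }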

The send operation (pfsync_sendout()) is split into two steps (sketched
below):
    The first step is invoked synchronously on behalf of the caller, from
    pf or from a periodic task. Here we grab the mutexes and move all
    states from the global queues to local data (a snapshot). We release
    the mutexes once we have the snapshot.

    The next step moves data from the snapshot into a pfsync packet. Once
    this is done, we dispatch the packet to a task, which pushes the
    packet to the network.
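
In pseudo-C the send path becomes something like this (a sketch only;
pfsync_build_pkt() is a made-up name standing in for the mbuf construction
code in pfsync_sendout() below):

        void
        pfsync_sendout(void)
        {
                struct pfsync_snapshot sn;
                struct pfsync_softc *sc = pfsyncif;
                struct mbuf *m;

                /* step 1: hold the mutexes only while stealing the queues */
                pfsync_grab_snapshot(&sn, sc);
                if (pfsync_is_snapshot_empty(&sn))
                        return;

                /* step 2: no mutexes held while the packet is built */
                m = pfsync_build_pkt(&sn);      /* hypothetical helper */
                pfsync_send_pkt(m);             /* hands packet to the send task */
        }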

The changes on the reception side are not significant; we just teach pfsync
to grab the PF_LOCK() and the state rw-lock to insert states received from a
peer.
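
That is, the input handlers now follow this pattern (taken from the diff,
slightly condensed; it sits inside the per-message loop of each handler):

        /* look the state up under the read lock and take a reference */
        PF_STATE_ENTER_READ();
        st = pf_find_state_byid(&id_key);
        pf_state_ref(st);
        PF_STATE_EXIT_READ();
        if (st == NULL)
                continue;
        /* ... work on st without holding the state tree lock ... */
        pf_state_unref(st);

and inserts of states received from a peer happen under the pf lock:

        PF_LOCK();
        error = pfsync_state_import(sp, flags);
        if (error)
                pfsyncstat_inc(pfsyncs_badstate);
        PF_UNLOCK();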


Feedback and testing are much appreciated.

thanks and
regards
sashan


--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
index 34dd370078a..e0508623b97 100644
--- a/sys/net/if_pfsync.c
+++ b/sys/net/if_pfsync.c
@@ -210,9 +210,11 @@ struct pfsync_softc {
        struct ip                sc_template;
 
        struct pf_state_queue    sc_qs[PFSYNC_S_COUNT];
+       struct mutex             sc_mtx[PFSYNC_S_COUNT];
        size_t                   sc_len;
 
        struct pfsync_upd_reqs   sc_upd_req_list;
+       struct mutex             sc_upd_req_mtx;
 
        int                      sc_initial_bulk;
        int                      sc_link_demoted;
@@ -220,6 +222,7 @@ struct pfsync_softc {
        int                      sc_defer;
        struct pfsync_deferrals  sc_deferrals;
        u_int                    sc_deferred;
+       struct mutex             sc_deferrals_mtx;
 
        void                    *sc_plus;
        size_t                   sc_pluslen;
@@ -234,6 +237,7 @@ struct pfsync_softc {
        struct timeout           sc_bulk_tmo;
 
        TAILQ_HEAD(, tdb)        sc_tdb_q;
+       struct mutex             sc_tdb_mtx;
 
        struct task              sc_ltask;
        struct task              sc_dtask;
@@ -241,6 +245,16 @@ struct pfsync_softc {
        struct timeout           sc_tmo;
 };
 
+struct pfsync_snapshot {
+       struct pfsync_softc     *sn_sc;
+       struct pf_state_queue    sn_qs[PFSYNC_S_COUNT];
+       struct pfsync_upd_reqs   sn_upd_req_list;
+       TAILQ_HEAD(, tdb)        sn_tdb_q;
+       size_t                   sn_len;
+       void                    *sn_plus;
+       size_t                   sn_pluslen;
+};
+
 struct pfsync_softc    *pfsyncif = NULL;
 struct cpumem          *pfsynccounters;
 
@@ -276,6 +290,10 @@ void       pfsync_bulk_start(void);
 void   pfsync_bulk_status(u_int8_t);
 void   pfsync_bulk_update(void *);
 void   pfsync_bulk_fail(void *);
+
+void   pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
+void   pfsync_drop_snapshot(struct pfsync_snapshot *);
+
 #ifdef WITH_PF_LOCK
 void   pfsync_send_dispatch(void *);
 void   pfsync_send_pkt(struct mbuf *);
@@ -314,18 +332,23 @@ pfsync_clone_create(struct if_clone *ifc, int unit)
        pfsync_sync_ok = 1;
 
        sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO);
-       for (q = 0; q < PFSYNC_S_COUNT; q++)
+       for (q = 0; q < PFSYNC_S_COUNT; q++) {
                TAILQ_INIT(&sc->sc_qs[q]);
+               mtx_init(&sc->sc_mtx[q], IPL_SOFTNET);
+       }
 
        pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync",
            NULL);
        TAILQ_INIT(&sc->sc_upd_req_list);
+       mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET);
        TAILQ_INIT(&sc->sc_deferrals);
+       mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET);
        task_set(&sc->sc_ltask, pfsync_syncdev_state, sc);
        task_set(&sc->sc_dtask, pfsync_ifdetach, sc);
        sc->sc_deferred = 0;
 
        TAILQ_INIT(&sc->sc_tdb_q);
+       mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET);
 
        sc->sc_len = PFSYNC_MINPKT;
        sc->sc_maxupdates = 128;
@@ -370,6 +393,7 @@ pfsync_clone_destroy(struct ifnet *ifp)
 {
        struct pfsync_softc *sc = ifp->if_softc;
        struct pfsync_deferral *pd;
+       struct pfsync_deferrals  deferrals;
 
        NET_LOCK();
 
@@ -391,10 +415,18 @@ pfsync_clone_destroy(struct ifnet *ifp)
 
        pfsync_drop(sc);
 
-       while (sc->sc_deferred > 0) {
-               pd = TAILQ_FIRST(&sc->sc_deferrals);
-               timeout_del(&pd->pd_tmo);
-               pfsync_undefer(pd, 0);
+       if (sc->sc_deferred > 0) {
+               TAILQ_INIT(&deferrals);
+               mtx_enter(&sc->sc_deferrals_mtx);
+               TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry);
+               sc->sc_deferred = 0;
+               mtx_leave(&sc->sc_deferrals_mtx);
+
+               while (!TAILQ_EMPTY(&deferrals)) {
+                       pd = TAILQ_FIRST(&deferrals);
+                       TAILQ_REMOVE(&deferrals, pd, pd_entry);
+                       pfsync_undefer(pd, 0);
+               }
        }
 
        pfsyncif = NULL;
@@ -764,10 +796,8 @@ pfsync_input(struct mbuf **mp, int *offp, int proto, int af)
                        return IPPROTO_DONE;
                }
 
-               PF_LOCK();
                e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count,
                    flags);
-               PF_UNLOCK();
                if (e != 0)
                        goto done;
 
@@ -788,6 +818,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
        u_int32_t creatorid;
        int i;
 
+       PF_LOCK();
        for (i = 0; i < count; i++) {
                clr = (struct pfsync_clr *)buf + len * i;
                kif = NULL;
@@ -796,6 +827,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
                    (kif = pfi_kif_find(clr->ifname)) == NULL)
                        continue;
 
+               PF_STATE_ENTER_WRITE();
                for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) {
                        nexts = RB_NEXT(pf_state_tree_id, &tree_id, st);
                        if (st->creatorid == creatorid &&
@@ -804,7 +836,9 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags)
                                pf_remove_state(st);
                        }
                }
+               PF_STATE_EXIT_WRITE();
        }
+       PF_UNLOCK();
 
        return (0);
 }
@@ -816,6 +850,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags)
        sa_family_t af1, af2;
        int i;
 
+       PF_LOCK();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_state *)(buf + len * i);
                af1 = sp->key[0].af;
@@ -841,6 +876,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags)
                        break;
                }
        }
+       PF_UNLOCK();
 
        return (0);
 }
@@ -859,12 +895,17 @@ pfsync_in_iack(caddr_t buf, int len, int count, int flags)
                id_key.id = ia->id;
                id_key.creatorid = ia->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL)
                        continue;
 
                if (ISSET(st->state_flags, PFSTATE_ACK))
                        pfsync_deferred(st, 0);
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -908,8 +949,7 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
        struct pfsync_state *sp;
        struct pf_state_cmp id_key;
        struct pf_state *st;
-       int sync;
-
+       int sync, error;
        int i;
 
        for (i = 0; i < count; i++) {
@@ -928,11 +968,17 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
                id_key.id = sp->id;
                id_key.creatorid = sp->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL) {
                        /* insert the update */
-                       if (pfsync_state_import(sp, flags))
+                       PF_LOCK();
+                       error = pfsync_state_import(sp, flags);
+                       if (error)
                                pfsyncstat_inc(pfsyncs_badstate);
+                       PF_UNLOCK();
                        continue;
                }
 
@@ -970,9 +1016,11 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state_locked(st);
+                       pfsync_update_state(st);
                        schednetisr(NETISR_PFSYNC);
                }
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -1005,7 +1053,10 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
                id_key.id = up->id;
                id_key.creatorid = up->creatorid;
 
+               PF_STATE_ENTER_READ();
                st = pf_find_state_byid(&id_key);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
                if (st == NULL) {
                        /* We don't have this state. Ask for it. */
                        pfsync_request_update(id_key.creatorid, id_key.id);
@@ -1044,9 +1095,11 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags)
                if (sync) {
                        pfsyncstat_inc(pfsyncs_stale);
 
-                       pfsync_update_state_locked(st);
+                       pfsync_update_state(st);
                        schednetisr(NETISR_PFSYNC);
                }
+
+               pf_state_unref(st);
        }
 
        return (0);
@@ -1070,15 +1123,21 @@ pfsync_in_ureq(caddr_t buf, int len, int count, int flags)
                if (id_key.id == 0 && id_key.creatorid == 0)
                        pfsync_bulk_start();
                else {
+                       PF_STATE_ENTER_READ();
                        st = pf_find_state_byid(&id_key);
+                       pf_state_ref(st);
+                       PF_STATE_EXIT_READ();
                        if (st == NULL) {
                                pfsyncstat_inc(pfsyncs_badstate);
                                continue;
                        }
-                       if (ISSET(st->state_flags, PFSTATE_NOSYNC))
+                       if (ISSET(st->state_flags, PFSTATE_NOSYNC)) {
+                               pf_state_unref(st);
                                continue;
+                       }
 
                        pfsync_update_state_req(st);
+                       pf_state_unref(st);
                }
        }
 
@@ -1093,6 +1152,7 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags)
        struct pf_state *st;
        int i;
 
+       PF_STATE_ENTER_WRITE();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_state *)(buf + len * i);
 
@@ -1101,12 +1161,14 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags)
 
                st = pf_find_state_byid(&id_key);
                if (st == NULL) {
+                       PF_STATE_EXIT_WRITE();
                        pfsyncstat_inc(pfsyncs_badstate);
                        continue;
                }
                SET(st->state_flags, PFSTATE_NOSYNC);
                pf_remove_state(st);
        }
+       PF_STATE_EXIT_WRITE();
 
        return (0);
 }
@@ -1119,6 +1181,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
        struct pf_state *st;
        int i;
 
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
        for (i = 0; i < count; i++) {
                sp = (struct pfsync_del_c *)(buf + len * i);
 
@@ -1134,6 +1198,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags)
                SET(st->state_flags, PFSTATE_NOSYNC);
                pf_remove_state(st);
        }
+       PF_STATE_EXIT_WRITE();
+       PF_UNLOCK();
 
        return (0);
 }
@@ -1473,19 +1539,59 @@ pfsync_out_del(struct pf_state *st, void *buf)
 }
 
 void
-pfsync_drop(struct pfsync_softc *sc)
+pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
+{
+       int q;
+
+       sn->sn_sc = sc;
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++)
+               mtx_enter(&sc->sc_mtx[q]);
+
+       mtx_enter(&sc->sc_upd_req_mtx);
+       mtx_enter(&sc->sc_tdb_mtx);
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++) {
+               TAILQ_INIT(&sn->sn_qs[q]);
+               TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
+       }
+
+       TAILQ_INIT(&sn->sn_upd_req_list);
+       TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
+
+       TAILQ_INIT(&sn->sn_tdb_q);
+       TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
+
+       sn->sn_len = sc->sc_len;
+       sc->sc_len = PFSYNC_MINPKT;
+
+       sn->sn_plus = sc->sc_plus;
+       sc->sc_plus = NULL;
+       sn->sn_pluslen = sc->sc_pluslen;
+       sc->sc_pluslen = 0;
+
+       mtx_leave(&sc->sc_tdb_mtx);
+       mtx_leave(&sc->sc_upd_req_mtx);
+
+       for (q = (PFSYNC_S_COUNT - 1); q >= 0; q--)
+               mtx_leave(&sc->sc_mtx[q]);
+}
+
+void
+pfsync_drop_snapshot(struct pfsync_snapshot *sn)
 {
        struct pf_state *st;
        struct pfsync_upd_req_item *ur;
        struct tdb *t;
        int q;
 
+
        for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               if (TAILQ_EMPTY(&sn->sn_qs[q]))
                        continue;
 
-               while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+               while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
+                       TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
 #ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
 #endif
@@ -1494,19 +1600,42 @@ pfsync_drop(struct pfsync_softc *sc)
                }
        }
 
-       while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
-               pool_put(&sc->sc_pool, ur);
+       while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
+               TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
+               pool_put(&sn->sn_sc->sc_pool, ur);
        }
 
-       sc->sc_plus = NULL;
-
-       while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
-               TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
+       while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
+               TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
                CLR(t->tdb_flags, TDBF_PFSYNC);
        }
+}
 
-       sc->sc_len = PFSYNC_MINPKT;
+int
+pfsync_is_snapshot_empty(struct pfsync_snapshot *sn)
+{
+       int     q;
+
+       for (q = 0; q < PFSYNC_S_COUNT; q++)
+               if (!TAILQ_EMPTY(&sn->sn_qs[q]))
+                       return (0);
+
+       if (!TAILQ_EMPTY(&sn->sn_upd_req_list))
+               return (0);
+
+       if (!TAILQ_EMPTY(&sn->sn_tdb_q))
+               return (0);
+
+       return (sn->sn_plus == NULL);
+}
+
+void
+pfsync_drop(struct pfsync_softc *sc)
+{
+       struct pfsync_snapshot  sn;
+
+       pfsync_grab_snapshot(&sn, sc);
+       pfsync_drop_snapshot(&sn);
 }
 
 #ifdef WITH_PF_LOCK
@@ -1559,6 +1688,7 @@ pfsync_send_pkt(struct mbuf *m)
 void
 pfsync_sendout(void)
 {
+       struct pfsync_snapshot sn;
        struct pfsync_softc *sc = pfsyncif;
 #if NBPFILTER > 0
        struct ifnet *ifp = &sc->sc_if;
@@ -1570,12 +1700,9 @@ pfsync_sendout(void)
        struct pf_state *st;
        struct pfsync_upd_req_item *ur;
        struct tdb *t;
-
        int offset;
        int q, count = 0;
 
-       PF_ASSERT_LOCKED();
-
        if (sc == NULL || sc->sc_len == PFSYNC_MINPKT)
                return;
 
@@ -1589,26 +1716,35 @@ pfsync_sendout(void)
                return;
        }
 
+       pfsync_grab_snapshot(&sn, sc);
+
+       /*
+        * Check below is sufficient to prevent us from sending empty packets,
+        * but it does not stop us from sending short packets.
+        */
+       if (pfsync_is_snapshot_empty(&sn))
+               return;
+
        MGETHDR(m, M_DONTWAIT, MT_DATA);
        if (m == NULL) {
                sc->sc_if.if_oerrors++;
                pfsyncstat_inc(pfsyncs_onomem);
-               pfsync_drop(sc);
+               pfsync_drop_snapshot(&sn);
                return;
        }
 
-       if (max_linkhdr + sc->sc_len > MHLEN) {
-               MCLGETI(m, M_DONTWAIT, NULL, max_linkhdr + sc->sc_len);
+       if (max_linkhdr + sn.sn_len > MHLEN) {
+               MCLGETI(m, M_DONTWAIT, NULL, max_linkhdr + sn.sn_len);
                if (!ISSET(m->m_flags, M_EXT)) {
                        m_free(m);
                        sc->sc_if.if_oerrors++;
                        pfsyncstat_inc(pfsyncs_onomem);
-                       pfsync_drop(sc);
+                       pfsync_drop_snapshot(&sn);
                        return;
                }
        }
        m->m_data += max_linkhdr;
-       m->m_len = m->m_pkthdr.len = sc->sc_len;
+       m->m_len = m->m_pkthdr.len = sn.sn_len;
 
        /* build the ip header */
        ip = mtod(m, struct ip *);
@@ -1624,16 +1760,16 @@ pfsync_sendout(void)
        offset += sizeof(*ph);
 
        ph->version = PFSYNC_VERSION;
-       ph->len = htons(sc->sc_len - sizeof(*ip));
+       ph->len = htons(sn.sn_len - sizeof(*ip));
        bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
-       if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+       if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) {
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+               while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
 
                        bcopy(&ur->ur_msg, m->m_data + offset,
                            sizeof(ur->ur_msg));
@@ -1651,20 +1787,19 @@ pfsync_sendout(void)
        }
 
        /* has someone built a custom region for us to add? */
-       if (sc->sc_plus != NULL) {
-               bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
-               offset += sc->sc_pluslen;
-
-               sc->sc_plus = NULL;
+       if (sn.sn_plus != NULL) {
+               bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen);
+               offset += sn.sn_pluslen;
+               sn.sn_plus = NULL;      /* XXX memory leak ? */
        }
 
-       if (!TAILQ_EMPTY(&sc->sc_tdb_q)) {
+       if (!TAILQ_EMPTY(&sn.sn_tdb_q)) {
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
+               while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
                        pfsync_out_tdb(t, m->m_data + offset);
                        offset += sizeof(struct pfsync_tdb);
                        CLR(t->tdb_flags, TDBF_PFSYNC);
@@ -1679,15 +1814,15 @@ pfsync_sendout(void)
 
        /* walk the queues */
        for (q = 0; q < PFSYNC_S_COUNT; q++) {
-               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               if (TAILQ_EMPTY(&sn.sn_qs[q]))
                        continue;
 
                subh = (struct pfsync_subheader *)(m->m_data + offset);
                offset += sizeof(*subh);
 
                count = 0;
-               while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
-                       TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+               while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
+                       TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
                        st->sync_state = PFSYNC_S_NONE;
 #ifdef PFSYNC_DEBUG
                        KASSERT(st->sync_state == q);
@@ -1709,22 +1844,18 @@ pfsync_sendout(void)
 #if NBPFILTER > 0
        if (ifp->if_bpf) {
                m->m_data += sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+               m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip);
                bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT);
                m->m_data -= sizeof(*ip);
-               m->m_len = m->m_pkthdr.len = sc->sc_len;
+               m->m_len = m->m_pkthdr.len = sn.sn_len;
        }
 
        if (sc->sc_sync_if == NULL) {
-               sc->sc_len = PFSYNC_MINPKT;
                m_freem(m);
                return;
        }
 #endif
 
-       /* start again */
-       sc->sc_len = PFSYNC_MINPKT;
-
        sc->sc_if.if_opackets++;
        sc->sc_if.if_obytes += m->m_pkthdr.len;
 
@@ -1783,7 +1914,11 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
                return (0);
 
        if (sc->sc_deferred >= 128) {
+               mtx_enter(&sc->sc_deferrals_mtx);
                pd = TAILQ_FIRST(&sc->sc_deferrals);
+               TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+               sc->sc_deferred--;
+               mtx_leave(&sc->sc_deferrals_mtx);
                if (timeout_del(&pd->pd_tmo))
                        pfsync_undefer(pd, 0);
        }
@@ -1798,8 +1933,10 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
        pd->pd_st = pf_state_ref(st);
        pd->pd_m = m;
 
+       mtx_enter(&sc->sc_deferrals_mtx);
        sc->sc_deferred++;
        TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
+       mtx_leave(&sc->sc_deferrals_mtx);
 
        timeout_set_proc(&pd->pd_tmo, pfsync_defer_tmo, pd);
        timeout_add_msec(&pd->pd_tmo, 20);
@@ -1810,73 +1947,96 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
 }
 
 void
-pfsync_undefer(struct pfsync_deferral *pd, int drop)
+pfsync_undefer_notify(struct pfsync_deferral *pd)
 {
-       struct pfsync_softc *sc = pfsyncif;
        struct pf_pdesc pdesc;
 
-       NET_ASSERT_LOCKED();
-
-       if (sc == NULL)
-               return;
-
-       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-       sc->sc_deferred--;
-
-       CLR(pd->pd_st->state_flags, PFSTATE_ACK);
-       if (drop)
-               m_freem(pd->pd_m);
-       else {
-               if (pd->pd_st->rule.ptr->rt == PF_ROUTETO) {
-                       if (pf_setup_pdesc(&pdesc,
-                           pd->pd_st->key[PF_SK_WIRE]->af,
-                           pd->pd_st->direction, pd->pd_st->rt_kif,
-                           pd->pd_m, NULL) != PF_PASS) {
-                               m_freem(pd->pd_m);
-                               goto out;
-                       }
-                       switch (pd->pd_st->key[PF_SK_WIRE]->af) {
-                       case AF_INET:
-                               pf_route(&pdesc,
-                                   pd->pd_st->rule.ptr, pd->pd_st);
-                               break;
+       if (pd->pd_st->rule.ptr->rt == PF_ROUTETO) {
+               if (pf_setup_pdesc(&pdesc,
+                   pd->pd_st->key[PF_SK_WIRE]->af,
+                   pd->pd_st->direction, pd->pd_st->rt_kif,
+                   pd->pd_m, NULL) != PF_PASS) {
+                       m_freem(pd->pd_m);
+                       return;
+               }
+               switch (pd->pd_st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       pf_route(&pdesc,
+                           pd->pd_st->rule.ptr, pd->pd_st);
+                       break;
 #ifdef INET6
-                       case AF_INET6:
-                               pf_route6(&pdesc,
-                                   pd->pd_st->rule.ptr, pd->pd_st);
-                               break;
+               case AF_INET6:
+                       pf_route6(&pdesc,
+                           pd->pd_st->rule.ptr, pd->pd_st);
+                       break;
 #endif /* INET6 */
-                       default:
-                               unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
-                       }
-                       pd->pd_m = pdesc.m;
-               } else {
-                       switch (pd->pd_st->key[PF_SK_WIRE]->af) {
-                       case AF_INET:
-                               ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL,
-                                   0);
-                               break;
+               default:
+                       unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
+               }
+               pd->pd_m = pdesc.m;
+       } else {
+               switch (pd->pd_st->key[PF_SK_WIRE]->af) {
+               case AF_INET:
+                       ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL,
+                           0);
+                       break;
 #ifdef INET6
-                       case AF_INET6:
-                               ip6_output(pd->pd_m, NULL, NULL, 0,
-                                   NULL, NULL);
-                               break;
+               case AF_INET6:
+                       ip6_output(pd->pd_m, NULL, NULL, 0,
+                           NULL, NULL);
+                       break;
 #endif /* INET6 */
-                       default:
-                               unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
-                       }
+               default:
+                       unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af);
                }
+
+               pd->pd_m = NULL;
        }
- out:
+}
+
+void
+pfsync_free_deferral(struct pfsync_deferral *pd)
+{
+       struct pfsync_softc *sc = pfsyncif;
+
        pf_state_unref(pd->pd_st);
+       if (pd->pd_m != NULL)
+               m_freem(pd->pd_m);
        pool_put(&sc->sc_pool, pd);
 }
 
+void
+pfsync_undefer(struct pfsync_deferral *pd, int drop)
+{
+       struct pfsync_softc *sc = pfsyncif;
+
+       NET_ASSERT_LOCKED();
+
+       if (sc == NULL)
+               return;
+
+       CLR(pd->pd_st->state_flags, PFSTATE_ACK);
+       if (drop) {
+               m_freem(pd->pd_m);
+               pd->pd_m = NULL;
+       } else
+               pfsync_undefer_notify(pd);
+
+       pfsync_free_deferral(pd);
+}
+
 void
 pfsync_defer_tmo(void *arg)
 {
+       struct pfsync_softc *sc = pfsyncif;
+       struct pfsync_deferral *pd = arg;
+
+       mtx_enter(&sc->sc_deferrals_mtx);
+       TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+       sc->sc_deferred--;
+       mtx_leave(&sc->sc_deferrals_mtx);
        NET_LOCK();
-       pfsync_undefer(arg, 0);
+       pfsync_undefer(pd, 0);
        NET_UNLOCK();
 }
 
@@ -1888,25 +2048,29 @@ pfsync_deferred(struct pf_state *st, int drop)
 
        NET_ASSERT_LOCKED();
 
+       mtx_enter(&sc->sc_deferrals_mtx);
        TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
                 if (pd->pd_st == st) {
-                       if (timeout_del(&pd->pd_tmo))
+                       if (timeout_del(&pd->pd_tmo)) {
+                               TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
+                               sc->sc_deferred--;
+                               mtx_leave(&sc->sc_deferrals_mtx);
                                pfsync_undefer(pd, drop);
+                       } else
+                               mtx_leave(&sc->sc_deferrals_mtx);
                        return;
                }
        }
-
-       panic("pfsync_deferred: unable to find deferred state");
+       mtx_leave(&sc->sc_deferrals_mtx);
 }
 
 void
-pfsync_update_state_locked(struct pf_state *st)
+pfsync_update_state(struct pf_state *st)
 {
        struct pfsync_softc *sc = pfsyncif;
        int sync = 0;
 
        NET_ASSERT_LOCKED();
-       PF_ASSERT_LOCKED();
 
        if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
                return;
@@ -1951,22 +2115,6 @@ pfsync_update_state_locked(struct pf_state *st)
                schednetisr(NETISR_PFSYNC);
 }
 
-void
-pfsync_update_state(struct pf_state *st, int *have_pf_lock)
-{
-       struct pfsync_softc *sc = pfsyncif;
-
-       if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING))
-               return;
-
-       if (*have_pf_lock == 0) {
-               PF_LOCK();
-               *have_pf_lock = 1;
-       }
-
-       pfsync_update_state_locked(st);
-}
-
 void
 pfsync_cancel_full_update(struct pfsync_softc *sc)
 {
@@ -2019,7 +2167,7 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
        struct pfsync_softc *sc = pfsyncif;
        struct pfsync_upd_req_item *item;
-       size_t nlen = sizeof(struct pfsync_upd_req);
+       size_t nlen, sc_len;
 
        /*
         * this code does nothing to prevent multiple update requests for the
@@ -2035,18 +2183,24 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id)
        item->ur_msg.id = id;
        item->ur_msg.creatorid = creatorid;
 
-       if (TAILQ_EMPTY(&sc->sc_upd_req_list))
-               nlen += sizeof(struct pfsync_subheader);
+       do {
+               mtx_enter(&sc->sc_upd_req_mtx);
 
-       if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-               pfsync_sendout();
+               nlen = sizeof(struct pfsync_upd_req);
+               if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+                       nlen += sizeof(struct pfsync_subheader);
 
-               nlen = sizeof(struct pfsync_subheader) +
-                   sizeof(struct pfsync_upd_req);
-       }
+               sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+               if (sc_len > sc->sc_if.if_mtu) {
+                       atomic_sub_long(&sc->sc_len, nlen);
+                       mtx_leave(&sc->sc_upd_req_mtx);
+                       pfsync_sendout();
+                       continue;
+               }
 
-       TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
-       sc->sc_len += nlen;
+               TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
+               mtx_leave(&sc->sc_upd_req_mtx);
+       } while (0);
 
        schednetisr(NETISR_PFSYNC);
 }
@@ -2172,27 +2326,45 @@ void
 pfsync_q_ins(struct pf_state *st, int q)
 {
        struct pfsync_softc *sc = pfsyncif;
-       size_t nlen = pfsync_qs[q].len;
+       size_t nlen, sc_len;
 
        KASSERT(st->sync_state == PFSYNC_S_NONE);
 
 #if defined(PFSYNC_DEBUG)
        if (sc->sc_len < PFSYNC_MINPKT)
-               panic("pfsync pkt len is too low %d", sc->sc_len);
+               panic("pfsync pkt len is too low %zu", sc->sc_len);
 #endif
-       if (TAILQ_EMPTY(&sc->sc_qs[q]))
-               nlen += sizeof(struct pfsync_subheader);
+       do {
+               mtx_enter(&sc->sc_mtx[q]);
 
-       if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-               pfsync_sendout();
+               /*
+                * If two threads are competing to insert the same state, then
+                * there must be just single winner.
+                */
+               if (st->sync_state != PFSYNC_S_NONE) {
+                       mtx_leave(&sc->sc_mtx[q]);
+                       break;
+               }
 
-               nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
-       }
+               nlen = pfsync_qs[q].len;
+
+               if (TAILQ_EMPTY(&sc->sc_qs[q]))
+                       nlen += sizeof(struct pfsync_subheader);
+
+               sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+               if (sc_len > sc->sc_if.if_mtu) {
+                       atomic_sub_long(&sc->sc_len, nlen);
+                       mtx_leave(&sc->sc_mtx[q]);
+                       pfsync_sendout();
+                       continue;
+               }
+
+               pf_state_ref(st);
 
-       sc->sc_len += nlen;
-       pf_state_ref(st);
-       TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
-       st->sync_state = q;
+               TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+               st->sync_state = q;
+               mtx_leave(&sc->sc_mtx[q]);
+       } while (0);
 }
 
 void
@@ -2203,39 +2375,49 @@ pfsync_q_del(struct pf_state *st)
 
        KASSERT(st->sync_state != PFSYNC_S_NONE);
 
-       sc->sc_len -= pfsync_qs[q].len;
+       mtx_enter(&sc->sc_mtx[q]);
+       atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
        TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+       if (TAILQ_EMPTY(&sc->sc_qs[q]))
+               atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader));
+       mtx_leave(&sc->sc_mtx[q]);
+
        st->sync_state = PFSYNC_S_NONE;
        pf_state_unref(st);
 
-       if (TAILQ_EMPTY(&sc->sc_qs[q]))
-               sc->sc_len -= sizeof(struct pfsync_subheader);
 }
 
 void
 pfsync_update_tdb(struct tdb *t, int output)
 {
        struct pfsync_softc *sc = pfsyncif;
-       size_t nlen = sizeof(struct pfsync_tdb);
+       size_t nlen, sc_len;
 
        if (sc == NULL)
                return;
 
        if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) {
-               if (TAILQ_EMPTY(&sc->sc_tdb_q))
-                       nlen += sizeof(struct pfsync_subheader);
-
-               if (sc->sc_len + nlen > sc->sc_if.if_mtu) {
-                       pfsync_sendout();
+               do {
+                       mtx_enter(&sc->sc_tdb_mtx);
+                       nlen = sizeof(struct pfsync_tdb);
+
+                       if (TAILQ_EMPTY(&sc->sc_tdb_q))
+                               nlen += sizeof(struct pfsync_subheader);
+
+                       sc_len = atomic_add_long_nv(&sc->sc_len, nlen);
+                       if (sc_len > sc->sc_if.if_mtu) {
+                               atomic_sub_long(&sc->sc_len, nlen);
+                               mtx_leave(&sc->sc_tdb_mtx);
+                               pfsync_sendout();
+                               continue;
+                       }
 
-                       nlen = sizeof(struct pfsync_subheader) +
-                           sizeof(struct pfsync_tdb);
-               }
+                       TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
+                       mtx_leave(&sc->sc_tdb_mtx);
 
-               sc->sc_len += nlen;
-               TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry);
-               SET(t->tdb_flags, TDBF_PFSYNC);
-               t->tdb_updates = 0;
+                       SET(t->tdb_flags, TDBF_PFSYNC);
+                       t->tdb_updates = 0;
+               } while (0);
        } else {
                if (++t->tdb_updates >= sc->sc_maxupdates)
                        schednetisr(NETISR_PFSYNC);
@@ -2251,16 +2433,22 @@ void
 pfsync_delete_tdb(struct tdb *t)
 {
        struct pfsync_softc *sc = pfsyncif;
+       size_t nlen;
 
        if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC))
                return;
 
-       sc->sc_len -= sizeof(struct pfsync_tdb);
+       mtx_enter(&sc->sc_tdb_mtx);
+
        TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
        CLR(t->tdb_flags, TDBF_PFSYNC);
 
+       nlen = sizeof(struct pfsync_tdb);
        if (TAILQ_EMPTY(&sc->sc_tdb_q))
-               sc->sc_len -= sizeof(struct pfsync_subheader);
+               nlen += sizeof(struct pfsync_subheader);
+       atomic_sub_long(&sc->sc_len, nlen);
+
+       mtx_leave(&sc->sc_tdb_mtx);
 }
 
 void
@@ -2308,9 +2496,14 @@ pfsync_bulk_start(void)
        else {
                sc->sc_ureq_received = time_uptime;
 
-               if (sc->sc_bulk_next == NULL)
+               if (sc->sc_bulk_next == NULL) {
+                       PF_STATE_ENTER_READ();
                        sc->sc_bulk_next = TAILQ_FIRST(&state_list);
+                       pf_state_ref(sc->sc_bulk_next);
+                       PF_STATE_EXIT_READ();
+               }
                sc->sc_bulk_last = sc->sc_bulk_next;
+               pf_state_ref(sc->sc_bulk_last);
 
                pfsync_bulk_status(PFSYNC_BUS_START);
                timeout_add(&sc->sc_bulk_tmo, 0);
@@ -2329,22 +2522,32 @@ pfsync_bulk_update(void *arg)
        if (sc == NULL)
                goto out;
        st = sc->sc_bulk_next;
+       sc->sc_bulk_next = NULL;
 
        for (;;) {
                if (st->sync_state == PFSYNC_S_NONE &&
                    st->timeout < PFTM_MAX &&
                    st->pfsync_time <= sc->sc_ureq_received) {
                        pfsync_update_state_req(st);
+                       pf_state_unref(st);
                        i++;
                }
 
+               /*
+                * I wonder how we prevent infinite bulk update.  IMO it can
+                * happen when sc_bulk_last state expires before we iterate
+                * through the whole list.
+                */
+               PF_STATE_ENTER_READ();
                st = TAILQ_NEXT(st, entry_list);
                if (st == NULL)
                        st = TAILQ_FIRST(&state_list);
+               pf_state_ref(st);
+               PF_STATE_EXIT_READ();
 
                if (st == sc->sc_bulk_last) {
                        /* we're done */
-                       sc->sc_bulk_next = NULL;
+                       pf_state_unref(sc->sc_bulk_last);
                        sc->sc_bulk_last = NULL;
                        pfsync_bulk_status(PFSYNC_BUS_END);
                        break;
@@ -2467,9 +2670,7 @@ void
 pfsync_timeout(void *arg)
 {
        NET_LOCK();
-       PF_LOCK();
        pfsync_sendout();
-       PF_UNLOCK();
        NET_UNLOCK();
 }
 
@@ -2477,9 +2678,7 @@ pfsync_timeout(void *arg)
 void
 pfsyncintr(void)
 {
-       PF_LOCK();
        pfsync_sendout();
-       PF_UNLOCK();
 }
 
 int
diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h
index 3c3b7dd90f6..ed2dbae3dd8 100644
--- a/sys/net/if_pfsync.h
+++ b/sys/net/if_pfsync.h
@@ -328,7 +328,7 @@ void                   pfsync_state_export(struct pfsync_state *,
                            struct pf_state *);
 
 void                   pfsync_insert_state(struct pf_state *);
-void                   pfsync_update_state(struct pf_state *, int *);
+void                   pfsync_update_state(struct pf_state *);
 void                   pfsync_delete_state(struct pf_state *);
 void                   pfsync_clear_states(u_int32_t, const char *);
 
diff --git a/sys/net/pf.c b/sys/net/pf.c
index ebe339921fa..25c80d6b75b 100644
--- a/sys/net/pf.c
+++ b/sys/net/pf.c
@@ -6980,7 +6980,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
@@ -7012,7 +7012,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
                PF_STATE_EXIT_READ();
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
@@ -7088,7 +7088,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0)
 
                if (action == PF_PASS || action == PF_AFRT) {
 #if NPFSYNC > 0
-                       pfsync_update_state(s, &have_pf_lock);
+                       pfsync_update_state(s);
 #endif /* NPFSYNC > 0 */
                        r = s->rule.ptr;
                        a = s->anchor.ptr;
