On Wed, May 19, 2021 at 01:48:26AM +0200, Alexandr Nedvedicky wrote:
> Hello,
> 
> just for the record...
> 
> </snip>
> 
> > > in the current tree ether_input() is protected by NET_LOCK(), which is
> > > grabbed by the caller as a writer. bluhm's diff changes NET_LOCK() to a
> > > read lock, so ether_input() can run concurrently. Switching NET_LOCK()
> > > to an r-lock has implications for the smr read section above. The thing
> > > is that the call to eb->eb_input() can sleep now. This is something that
> > > needs to be avoided within an smr section.
> > 
> > Is the new sleeping point introduced by the fact that PF_LOCK() is an
> > rwlock?  Did you consider using a mutex, at least for the time being,
> > in order to not run into such issues?
> 
>     below is a diff, which trades both pf(4) rw-locks for mutexes.
> 
>     diff compiles, it still needs testing/experimenting.

hi. i'm trying to get my head back into this space, so i'm having a
proper look at these diffs.

moving pf locks to mutexes makes sense to me, but like you say,
this will need testing and experimentation. one of the issues
identified in another part of this thread (and on icb?) is that the
ioctl path does some things that can sleep, but you can't sleep
while holding a mutex, and a mutex would now be protecting the data
structures you're trying to look at.

a more specific example is if you're trying to dump the state table.
like you say, the state table is currently protected by the
pf_state_lock. iterating over the state table and giving up this lock to
copyout the entries is... not fun.
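
to make that concrete, here's a rough sketch of the pattern that stops
being legal once a mutex protects the states. this isn't code from any
diff in this thread; the lock name is made up:

	/*
	 * XXX hypothetical sketch: a single mutex protecting the
	 * state table.
	 */
	mtx_enter(&pf_state_mtx);
	TAILQ_FOREACH(st, &state_list, entry_list) {
		pf_state_export(&pstore, st);

		/*
		 * copyout() can fault and sleep, and sleeping with a
		 * mutex held is not allowed, so this is broken.
		 */
		error = copyout(&pstore, p, sizeof(pstore));
		if (error)
			break;
		p++;
	}
	mtx_leave(&pf_state_mtx);

the alternative of dropping and retaking the lock around every copyout
means the list can change under you between chunks, which is the "not
fun" part.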

the diff below is a work-in-progress attempt to address this. it works
by locking the pf state list separately so it can support traversal
without (long) holds of the locks needed in the forwarding path. the
very short explanation is that the TAILQ holding the states is locked
separately from the links between the states in the list. a much longer
explanation is in the pfvar_priv.h chunk of the diff.
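
condensed, the traversal pattern the diff uses looks like this (the
same code shows up in the pf.c and pf_ioctl.c chunks below):

	/* lock out the gc so the links between states can't change */
	rw_enter_read(&pf_state_list.pfs_rwl);

	/* take a snapshot of the ends of the list under the mutex */
	mtx_enter(&pf_state_list.pfs_mtx);
	head = TAILQ_FIRST(&pf_state_list.pfs_list);
	tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
	mtx_leave(&pf_state_list.pfs_mtx);

	/* iterate between the snapshot of the head and tail */
	state = NULL;
	next = head;
	while (state != tail) {
		state = next;
		next = TAILQ_NEXT(state, entry_list);

		/* look at the state */
	}

	rw_exit_read(&pf_state_list.pfs_rwl);

if the list is empty, head and tail are both NULL and the loop simply
doesn't run.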

inserting states into the TAILQ while processing packets uses a mutex.
iterating over the list in the pf purge processing, the ioctl paths,
and pfsync can largely rely on an rwlock. the rwlock also allows
concurrent access to the list of states, i.e., you can dump the list of
states while the gc thread is looking for states to collect and while
pfsync is sending out a bulk update. the diff also converts the ioctl
handling for flushing states to use the list instead of one of the RB
trees holding states. i'm sure everyone wants an omgoptimised state
flushing mechanism.
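
for reference, the insert and remove halves boil down to this
(condensed from pf_state_list_insert() and pf_state_list_remove() in
the pf.c chunk below):

	/* packet processing inserts at the tail with just the mutex */
	pf_state_ref(st);			/* the list holds a ref */
	mtx_enter(&pf_state_list.pfs_mtx);
	TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, st, entry_list);
	mtx_leave(&pf_state_list.pfs_mtx);

	/* only the gc removes, and only with the write lock held */
	rw_assert_wrlock(&pf_state_list.pfs_rwl);
	mtx_enter(&pf_state_list.pfs_mtx);
	TAILQ_REMOVE(&pf_state_list.pfs_list, st, entry_list);
	mtx_leave(&pf_state_list.pfs_mtx);
	pf_state_unref(st);			/* drop the list's ref */

the asymmetry is the point: inserts stay cheap on the packet path,
while removal is the one that takes the expensive lock.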

this is a work in progress because i've screwed up the pf_state
reference counting somehow. states end up with an extra reference, and i
can't see where that's coming from.

anyway. apart from the refcounting thing it seems to be working well,
and would take us a step closer to using mutexes for pf locks.

even if we throw this away, i'd still like to move the pf purge
processing out of nettq0 into systq, just to avoid having a place
where a nettq thread has to spin waiting for the kernel lock.

Index: pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1118
diff -u -p -r1.1118 pf.c
--- pf.c        1 Jun 2021 09:57:11 -0000       1.1118
+++ pf.c        2 Jun 2021 07:52:21 -0000
@@ -308,7 +308,7 @@ static __inline void pf_set_protostate(s
 struct pf_src_tree tree_src_tracking;
 
 struct pf_state_tree_id tree_id;
-struct pf_state_queue state_list;
+struct pf_state_list pf_state_list = PF_STATE_LIST_INITIALIZER(pf_state_list);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
 RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
@@ -440,6 +440,37 @@ pf_check_threshold(struct pf_threshold *
        return (threshold->count > threshold->limit);
 }
 
+void
+pf_state_list_insert(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /*
+        * we can always put states on the end of the list.
+        *
+        * things reading the list should take a read lock, then
+        * the mutex, get the head and tail pointers, release the
+        * mutex, and then they can iterate between the head and tail.
+        */
+
+       pf_state_ref(st); /* get a ref for the list */
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_INSERT_TAIL(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+}
+
+void
+pf_state_list_remove(struct pf_state_list *pfs, struct pf_state *st)
+{
+       /* states can only be removed when the write lock is held */
+       rw_assert_wrlock(&pfs->pfs_rwl);
+
+       mtx_enter(&pfs->pfs_mtx);
+       TAILQ_REMOVE(&pfs->pfs_list, st, entry_list);
+       mtx_leave(&pfs->pfs_mtx);
+
+       pf_state_unref(st); /* list no longer references the state */
+}
+
 int
 pf_src_connlimit(struct pf_state **state)
 {
@@ -986,7 +1017,7 @@ pf_state_insert(struct pfi_kif *kif, str
                PF_STATE_EXIT_WRITE();
                return (-1);
        }
-       TAILQ_INSERT_TAIL(&state_list, s, entry_list);
+       pf_state_list_insert(&pf_state_list, s);
        pf_status.fcounters[FCNT_STATE_INSERT]++;
        pf_status.states++;
        pfi_kif_ref(kif, PFI_KIF_REF_STATE);
@@ -1247,7 +1278,8 @@ pf_purge_expired_rules(void)
 void
 pf_purge_timeout(void *unused)
 {
-       task_add(net_tq(0), &pf_purge_task);
+       /* XXX move to systqmp to avoid KERNEL_LOCK */
+       task_add(systq, &pf_purge_task);
 }
 
 void
@@ -1255,7 +1287,6 @@ pf_purge(void *xnloops)
 {
        int *nloops = xnloops;
 
-       KERNEL_LOCK();
        NET_LOCK();
 
        /*
@@ -1284,7 +1315,6 @@ pf_purge(void *xnloops)
                *nloops = 0;
        }
        NET_UNLOCK();
-       KERNEL_UNLOCK();
 
        timeout_add_sec(&pf_purge_to, 1);
 }
@@ -1447,7 +1477,7 @@ pf_free_state(struct pf_state *cur)
        }
        pf_normalize_tcp_cleanup(cur);
        pfi_kif_unref(cur->kif, PFI_KIF_REF_STATE);
-       TAILQ_REMOVE(&state_list, cur, entry_list);
+       pf_state_list_remove(&pf_state_list, cur);
        if (cur->tag)
                pf_tag_unref(cur->tag);
        pf_state_unref(cur);
@@ -1458,53 +1488,80 @@ pf_free_state(struct pf_state *cur)
 void
 pf_purge_expired_states(u_int32_t maxcheck)
 {
+       /*
+        * this task/thread/context/whatever is the only thing that
+        * removes states from the pf_state_list, so the cur reference
+        * it holds between calls is guaranteed to still be in the
+        * list.
+        */
        static struct pf_state  *cur = NULL;
-       struct pf_state         *next;
-       SLIST_HEAD(pf_state_gcl, pf_state) gcl;
+
+       struct pf_state         *head, *tail;
+       struct pf_state         *st;
+       SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
+       unsigned int gcn = 0;
 
        PF_ASSERT_UNLOCKED();
-       SLIST_INIT(&gcl);
 
-       PF_STATE_ENTER_READ();
-       while (maxcheck--) {
-               /* wrap to start of list when we hit the end */
-               if (cur == NULL) {
-                       cur = pf_state_ref(TAILQ_FIRST(&state_list));
-                       if (cur == NULL)
-                               break;  /* list empty */
-               }
+       rw_enter_read(&pf_state_list.pfs_rwl);
 
-               /* get next state, as cur may get deleted */
-               next = TAILQ_NEXT(cur, entry_list);
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       if (head == NULL) {
+               /* the list is empty */
+               rw_exit_read(&pf_state_list.pfs_rwl);
+               return;
+       }
 
-               if ((cur->timeout == PFTM_UNLINKED) ||
-                   (pf_state_expires(cur) <= getuptime()))
-                       SLIST_INSERT_HEAD(&gcl, cur, gc_list);
-               else
-                       pf_state_unref(cur);
+       /* (re)start at the front of the list */
+       if (cur == NULL)
+               cur = head;
 
-               cur = pf_state_ref(next);
+       do {
+               if ((cur->timeout == PFTM_UNLINKED) ||
+                   (pf_state_expires(cur) <= getuptime())) {
+                       SLIST_INSERT_HEAD(&gcl, pf_state_ref(cur), gc_list);
+                       gcn++;
+               }
 
-               if (cur == NULL)
+               /* don't iterate past the end of our view of the list */
+               if (cur == tail) {
+                       cur = NULL;
                        break;
-       }
-       PF_STATE_EXIT_READ();
+               }
+
+               cur = TAILQ_NEXT(cur, entry_list);
+       } while (maxcheck--);
 
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
+       if (SLIST_EMPTY(&gcl))
+               return;
+
+       printf("%s: purging %u states\n", __func__, gcn);
+
+       rw_enter_write(&pf_state_list.pfs_rwl);
        PF_LOCK();
        PF_STATE_ENTER_WRITE();
-       while ((next = SLIST_FIRST(&gcl)) != NULL) {
-               SLIST_REMOVE_HEAD(&gcl, gc_list);
-               if (next->timeout == PFTM_UNLINKED)
-                       pf_free_state(next);
-               else {
-                       pf_remove_state(next);
-                       pf_free_state(next);
-               }
+       SLIST_FOREACH(st, &gcl, gc_list) {
+               if (st->timeout != PFTM_UNLINKED)
+                       pf_remove_state(st);
 
-               pf_state_unref(next);
+               pf_free_state(st);
        }
        PF_STATE_EXIT_WRITE();
        PF_UNLOCK();
+       rw_exit_write(&pf_state_list.pfs_rwl);
+
+       while ((st = SLIST_FIRST(&gcl)) != NULL) {
+               SLIST_REMOVE_HEAD(&gcl, gc_list);
+printf("%s: state %p refs %u\n", __func__, st, st->refcnt.refs);
+               pf_state_unref(st);
+               pf_state_unref(st); /* XXX */
+       }
 }
 
 int
Index: pf_ioctl.c
===================================================================
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.364
diff -u -p -r1.364 pf_ioctl.c
--- pf_ioctl.c  2 Jun 2021 07:46:22 -0000       1.364
+++ pf_ioctl.c  2 Jun 2021 07:52:22 -0000
@@ -113,6 +113,8 @@ int                  pf_rule_copyin(struct pf_rule *, 
 u_int16_t               pf_qname2qid(char *, int);
 void                    pf_qid2qname(u_int16_t, char *);
 void                    pf_qid_unref(u_int16_t);
+int                     pf_states_clr(struct pfioc_state_kill *);
+int                     pf_states_get(struct pfioc_states *);
 
 struct pf_rule          pf_default_rule, pf_default_rule_new;
 
@@ -206,7 +208,6 @@ pfattach(int num)
        TAILQ_INIT(&pf_queues[1]);
        pf_queues_active = &pf_queues[0];
        pf_queues_inactive = &pf_queues[1];
-       TAILQ_INIT(&state_list);
 
        /* default rule should never be garbage collected */
        pf_default_rule.entries.tqe_prev = &pf_default_rule.entries.tqe_next;
@@ -903,6 +904,122 @@ pf_addr_copyout(struct pf_addr_wrap *add
 }
 
 int
+pf_states_clr(struct pfioc_state_kill *psk)
+{
+       struct pf_state         *s, *nexts;
+       struct pf_state         *head, *tail;
+       u_int                    killed = 0;
+       int                      error;
+
+       NET_LOCK();
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               goto unlock;
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       s = NULL;
+       nexts = head;
+
+       PF_LOCK();
+       PF_STATE_ENTER_WRITE();
+
+       while (s != tail) {
+               s = nexts;
+               nexts = TAILQ_NEXT(s, entry_list);
+
+               if (s->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
+                   s->kif->pfik_name)) {
+#if NPFSYNC > 0
+                       /* don't send out individual delete messages */
+                       SET(s->state_flags, PFSTATE_NOSYNC);
+#endif /* NPFSYNC > 0 */
+                       pf_remove_state(s);
+                       killed++;
+               }
+       }
+
+       PF_STATE_EXIT_WRITE();
+#if NPFSYNC > 0
+       pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
+#endif /* NPFSYNC > 0 */
+       PF_UNLOCK();
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       psk->psk_killed = killed;
+unlock:
+       NET_UNLOCK();
+
+       return (error);
+}
+
+int
+pf_states_get(struct pfioc_states *ps)
+{
+       struct pf_state         *head, *tail;
+       struct pf_state         *next, *state;
+       struct pfsync_state     *p, pstore;
+       u_int32_t                nr = 0;
+       int                      error;
+
+       if (ps->ps_len == 0) {
+               nr = pf_status.states;
+               ps->ps_len = sizeof(struct pfsync_state) * nr;
+               return (0);
+       }
+
+       p = ps->ps_states;
+
+       /* lock against the gc removing an item from the list */
+       error = rw_enter(&pf_state_list.pfs_rwl, RW_READ|RW_INTR);
+       if (error != 0)
+               return (error);
+
+       /* get a snapshot view of the ends of the list to traverse between */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       state = NULL;
+       next = head;
+
+       while (state != tail) {
+               state = next;
+               next = TAILQ_NEXT(state, entry_list);
+
+               if (state->timeout == PFTM_UNLINKED)
+                       continue;
+
+               if ((nr+1) * sizeof(*p) > ps->ps_len)
+                       break;
+
+               pf_state_export(&pstore, state);
+               error = copyout(&pstore, p, sizeof(*p));
+               if (error)
+                       goto fail;
+
+               p++;
+               nr++;
+       }
+       ps->ps_len = sizeof(struct pfsync_state) * nr;
+
+fail:
+       rw_exit(&pf_state_list.pfs_rwl);
+
+       return (error);
+}
+
+int
 pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p)
 {
        int                      error = 0;
@@ -1545,36 +1662,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
                break;
        }
 
-       case DIOCCLRSTATES: {
-               struct pf_state         *s, *nexts;
-               struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr;
-               u_int                    killed = 0;
-
-               NET_LOCK();
-               PF_LOCK();
-               PF_STATE_ENTER_WRITE();
-               for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) {
-                       nexts = RB_NEXT(pf_state_tree_id, &tree_id, s);
-
-                       if (!psk->psk_ifname[0] || !strcmp(psk->psk_ifname,
-                           s->kif->pfik_name)) {
-#if NPFSYNC > 0
-                               /* don't send out individual delete messages */
-                               SET(s->state_flags, PFSTATE_NOSYNC);
-#endif /* NPFSYNC > 0 */
-                               pf_remove_state(s);
-                               killed++;
-                       }
-               }
-               PF_STATE_EXIT_WRITE();
-               psk->psk_killed = killed;
-#if NPFSYNC > 0
-               pfsync_clear_states(pf_status.hostid, psk->psk_ifname);
-#endif /* NPFSYNC > 0 */
-               PF_UNLOCK();
-               NET_UNLOCK();
+       case DIOCCLRSTATES:
+               error = pf_states_clr((struct pfioc_state_kill *)addr);
                break;
-       }
 
        case DIOCKILLSTATES: {
                struct pf_state         *s, *nexts;
@@ -1757,50 +1847,9 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
                break;
        }
 
-       case DIOCGETSTATES: {
-               struct pfioc_states     *ps = (struct pfioc_states *)addr;
-               struct pf_state         *state;
-               struct pfsync_state     *p, *pstore;
-               u_int32_t                nr = 0;
-
-               if (ps->ps_len == 0) {
-                       nr = pf_status.states;
-                       ps->ps_len = sizeof(struct pfsync_state) * nr;
-                       break;
-               }
-
-               pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
-
-               p = ps->ps_states;
-
-               NET_LOCK();
-               PF_STATE_ENTER_READ();
-               state = TAILQ_FIRST(&state_list);
-               while (state) {
-                       if (state->timeout != PFTM_UNLINKED) {
-                               if ((nr+1) * sizeof(*p) > ps->ps_len)
-                                       break;
-                               pf_state_export(pstore, state);
-                               error = copyout(pstore, p, sizeof(*p));
-                               if (error) {
-                                       free(pstore, M_TEMP, sizeof(*pstore));
-                                       PF_STATE_EXIT_READ();
-                                       NET_UNLOCK();
-                                       goto fail;
-                               }
-                               p++;
-                               nr++;
-                       }
-                       state = TAILQ_NEXT(state, entry_list);
-               }
-               PF_STATE_EXIT_READ();
-               NET_UNLOCK();
-
-               ps->ps_len = sizeof(struct pfsync_state) * nr;
-
-               free(pstore, M_TEMP, sizeof(*pstore));
+       case DIOCGETSTATES:
+               error = pf_states_get((struct pfioc_states *)addr);
                break;
-       }
 
        case DIOCGETSTATUS: {
                struct pf_status *s = (struct pf_status *)addr;
Index: pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.500
diff -u -p -r1.500 pfvar.h
--- pfvar.h     10 Mar 2021 10:21:48 -0000      1.500
+++ pfvar.h     2 Jun 2021 07:52:22 -0000
@@ -1682,7 +1682,7 @@ RB_HEAD(pf_state_tree_id, pf_state);
 RB_PROTOTYPE(pf_state_tree_id, pf_state,
     entry_id, pf_state_compare_id);
 extern struct pf_state_tree_id tree_id;
-extern struct pf_state_queue state_list;
+extern struct pf_state_list pf_state_list;
 
 TAILQ_HEAD(pf_queuehead, pf_queuespec);
 extern struct pf_queuehead               pf_queues[2];
Index: pfvar_priv.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar_priv.h,v
retrieving revision 1.6
diff -u -p -r1.6 pfvar_priv.h
--- pfvar_priv.h        9 Feb 2021 14:06:19 -0000       1.6
+++ pfvar_priv.h        2 Jun 2021 07:52:22 -0000
@@ -38,6 +38,115 @@
 #ifdef _KERNEL
 
 #include <sys/rwlock.h>
+#include <sys/mutex.h>
+
+/*
+
+states are linked into a global list to support the following
+functionality:
+
+ - garbage collection
+ - pfsync bulk send operations
+ - bulk state fetches via the DIOCGETSTATES ioctl
+ - bulk state clearing via the DIOCCLRSTATES ioctl
+
+a state is inserted into the global pf_state_list once it has also
+been successfully added to the various trees that make up the state
+table. states are only removed from the pf_state_list by the garbage
+collection process.
+
+the pf_state_list head and tail pointers (ie, the pfs_list TAILQ_HEAD
+structure) and the pointers between the entries on the pf_state_list
+are locked separately. at a high level, this allows for insertion
+of new states into the pf_state_list while other contexts (eg, the
+ioctls) are traversing the state items in the list. for garbage
+collection to remove items from the pf_state_list, it has to exclude
+both modifications to the list head and tail pointers, and traversal
+of the links between the states.
+
+the head and tail pointers are protected by a mutex. the pointers
+between states are protected by an rwlock.
+
+because insertions are only made to the end of the list, if we get
+a snapshot of the head and tail of the list and prevent modifications
+to the links between states, we can safely traverse between the
+head and tail entries. subsequent insertions can add entries after
+our view of the tail, but we don't look past our view.
+
+if both locks must be taken, the rwlock protecting the links between
+states is taken before the mutex protecting the head and tail
+pointer.
+
+insertion into the list follows this pattern:
+
+       // serialise list head/tail modifications
+       mtx_enter(&pf_state_list.pfs_mtx);
+       TAILQ_INSERT_TAIL(&pf_state_list.pfs_list, state, entry_list);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+traversal of the list:
+
+       // lock against the gc removing an item from the list
+       rw_enter_read(&pf_state_list.pfs_rwl);
+
+       // get a snapshot view of the ends of the list
+       mtx_enter(&pf_state_list.pfs_mtx);
+       head = TAILQ_FIRST(&pf_state_list.pfs_list);
+       tail = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       state = NULL;
+       next = head;
+
+       while (state != tail) {
+               state = next;
+               next = TAILQ_NEXT(state, entry_list);
+
+               // look at the state
+       }
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
+removing an item from the list:
+
+       // wait for iterators (readers) to get out
+       rw_enter_write(&pf_state_list.pfs_rwl);
+
+       // serialise list head/tail modifications
+       mtx_enter(&pf_state_list.pfs_mtx);
+       TAILQ_REMOVE(&pf_state_list.pfs_list, state, entry_list);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       rw_exit_write(&pf_state_list.pfs_rwl);
+
+the lock ordering for the pf_state_list locks and the rest of the
+pf locks is:
+
+1. KERNEL_LOCK
+2. NET_LOCK
+3. pf_state_list.pfs_rwl
+4. PF_LOCK
+5. PF_STATE_LOCK
+6. pf_state_list.pfs_mtx
+
+*/
+
+struct pf_state_list {
+       /* the actual list of states in the system */
+       struct pf_state_queue           pfs_list;
+
+       /* serialises insertion of states in the list */
+       struct mutex                    pfs_mtx;
+
+       /* coordinate access to the list for deletion */
+       struct rwlock                   pfs_rwl;
+};
+
+#define PF_STATE_LIST_INITIALIZER(_pfs) {                              \
+       .pfs_list       = TAILQ_HEAD_INITIALIZER(_pfs.pfs_list),        \
+       .pfs_mtx        = MUTEX_INITIALIZER(IPL_SOFTNET),               \
+       .pfs_rwl        = RWLOCK_INITIALIZER("pfstates"),               \
+}
 
 extern struct rwlock pf_lock;
 
Index: if_pfsync.c
===================================================================
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.288
diff -u -p -r1.288 if_pfsync.c
--- if_pfsync.c 10 Mar 2021 10:21:48 -0000      1.288
+++ if_pfsync.c 2 Jun 2021 07:52:22 -0000
@@ -2509,22 +2509,34 @@ pfsync_bulk_start(void)
 {
        struct pfsync_softc *sc = pfsyncif;
 
+       NET_ASSERT_LOCKED();
+
+       /*
+        * pf gc via pfsync_state_in_use reads sc_bulk_next and
+        * sc_bulk_last while exclusively holding the pf_state_list
+        * rwlock. make sure it can't race with us setting these
+        * pointers. they basically act as hazards, and borrow the
+        * lists state reference count.
+        */
+       rw_enter_read(&pf_state_list.pfs_rwl);
+
+       /* get a consistent view of the list pointers */
+       mtx_enter(&pf_state_list.pfs_mtx);
+       if (sc->sc_bulk_next == NULL)
+               sc->sc_bulk_next = TAILQ_FIRST(&pf_state_list.pfs_list);
+
+       sc->sc_bulk_last = TAILQ_LAST(&pf_state_list.pfs_list, pf_state_queue);
+       mtx_leave(&pf_state_list.pfs_mtx);
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
+
        DPFPRINTF(LOG_INFO, "received bulk update request");
 
-       if (TAILQ_EMPTY(&state_list))
+       if (sc->sc_bulk_last == NULL)
                pfsync_bulk_status(PFSYNC_BUS_END);
        else {
                sc->sc_ureq_received = getuptime();
 
-               if (sc->sc_bulk_next == NULL) {
-                       PF_STATE_ENTER_READ();
-                       sc->sc_bulk_next = TAILQ_FIRST(&state_list);
-                       pf_state_ref(sc->sc_bulk_next);
-                       PF_STATE_EXIT_READ();
-               }
-               sc->sc_bulk_last = sc->sc_bulk_next;
-               pf_state_ref(sc->sc_bulk_last);
-
                pfsync_bulk_status(PFSYNC_BUS_START);
                timeout_add(&sc->sc_bulk_tmo, 0);
        }
@@ -2534,13 +2546,15 @@ void
 pfsync_bulk_update(void *arg)
 {
        struct pfsync_softc *sc;
-       struct pf_state *st, *st_next;
+       struct pf_state *st;
        int i = 0;
 
        NET_LOCK();
        sc = pfsyncif;
        if (sc == NULL)
                goto out;
+
+       rw_enter_read(&pf_state_list.pfs_rwl);
        st = sc->sc_bulk_next;
        sc->sc_bulk_next = NULL;
 
@@ -2552,23 +2566,9 @@ pfsync_bulk_update(void *arg)
                        i++;
                }
 
-               /*
-                * I wonder how we prevent infinite bulk update.  IMO it can
-                * happen when sc_bulk_last state expires before we iterate
-                * through the whole list.
-                */
-               PF_STATE_ENTER_READ();
-               st_next = TAILQ_NEXT(st, entry_list);
-               pf_state_unref(st);
-               st = st_next;
-               if (st == NULL)
-                       st = TAILQ_FIRST(&state_list);
-               pf_state_ref(st);
-               PF_STATE_EXIT_READ();
-
+               st = TAILQ_NEXT(st, entry_list);
                if ((st == NULL) || (st == sc->sc_bulk_last)) {
                        /* we're done */
-                       pf_state_unref(sc->sc_bulk_last);
                        sc->sc_bulk_last = NULL;
                        pfsync_bulk_status(PFSYNC_BUS_END);
                        break;
@@ -2582,6 +2582,8 @@ pfsync_bulk_update(void *arg)
                        break;
                }
        }
+
+       rw_exit_read(&pf_state_list.pfs_rwl);
  out:
        NET_UNLOCK();
 }
@@ -2678,6 +2680,8 @@ pfsync_state_in_use(struct pf_state *st)
 
        if (sc == NULL)
                return (0);
+
+       rw_assert_wrlock(&pf_state_list.pfs_rwl);
 
        if (st->sync_state != PFSYNC_S_NONE ||
            st == sc->sc_bulk_next ||
