On Thu, Nov 24, 2022 at 11:23:51AM +1000, David Gwynne wrote:
> > we're working toward dropping the need for NET_LOCK before PF_LOCK. could
> > we try the diff below as a compromise?
> >
> 
> sashan@ and mvs@ have pushed that forward, so this diff should be enough
> now.

This diff has been reverted due to netlock splassert failures.

So I would like to revert the origin of my deadlock.  Move pf_purge
back to systq.

ok?

bluhm

Index: net/pf.c
===================================================================
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1155
diff -u -p -r1.1155 pf.c
--- net/pf.c    25 Nov 2022 18:03:53 -0000      1.1155
+++ net/pf.c    25 Nov 2022 18:33:08 -0000
@@ -120,6 +120,10 @@ u_char                      pf_tcp_secret[16];
 int                     pf_tcp_secret_init;
 int                     pf_tcp_iss_off;
 
+int             pf_npurge;
+struct task     pf_purge_task = TASK_INITIALIZER(pf_purge, &pf_npurge);
+struct timeout  pf_purge_to = TIMEOUT_INITIALIZER(pf_purge_timeout, NULL);
+
 enum pf_test_status {
        PF_TEST_FAIL = -1,
        PF_TEST_OK,
@@ -1516,110 +1520,47 @@ pf_state_import(const struct pfsync_stat
 
 /* END state table stuff */
 
-void            pf_purge_states(void *);
-struct task     pf_purge_states_task =
-                    TASK_INITIALIZER(pf_purge_states, NULL);
-
-void            pf_purge_states_tick(void *);
-struct timeout  pf_purge_states_to =
-                    TIMEOUT_INITIALIZER(pf_purge_states_tick, NULL);
-
-unsigned int    pf_purge_expired_states(unsigned int, unsigned int);
-
-/*
- * how many states to scan this interval.
- *
- * this is set when the timeout fires, and reduced by the task. the
- * task will reschedule itself until the limit is reduced to zero,
- * and then it adds the timeout again.
- */
-unsigned int pf_purge_states_limit;
-
-/*
- * limit how many states are processed with locks held per run of
- * the state purge task.
- */
-unsigned int pf_purge_states_collect = 64;
-
 void
-pf_purge_states_tick(void *null)
+pf_purge_timeout(void *unused)
 {
-       unsigned int limit = pf_status.states;
-       unsigned int interval = pf_default_rule.timeout[PFTM_INTERVAL];
-
-       if (limit == 0) {
-               timeout_add_sec(&pf_purge_states_to, 1);
-               return;
-       }
-
-       /*
-        * process a fraction of the state table every second
-        */
-
-       if (interval > 1)
-               limit /= interval;
-
-       pf_purge_states_limit = limit;
-       task_add(systqmp, &pf_purge_states_task);
-}
-
-void
-pf_purge_states(void *null)
-{
-       unsigned int limit;
-       unsigned int scanned;
-
-       limit = pf_purge_states_limit;
-       if (limit < pf_purge_states_collect)
-               limit = pf_purge_states_collect;
-
-       scanned = pf_purge_expired_states(limit, pf_purge_states_collect);
-       if (scanned >= pf_purge_states_limit) {
-               /* we've run out of states to scan this "interval" */
-               timeout_add_sec(&pf_purge_states_to, 1);
-               return;
-       }
-
-       pf_purge_states_limit -= scanned;
-       task_add(systqmp, &pf_purge_states_task);
+       /* XXX move to systqmp to avoid KERNEL_LOCK */
+       task_add(systq, &pf_purge_task);
 }
 
-void            pf_purge_tick(void *);
-struct timeout  pf_purge_to =
-                    TIMEOUT_INITIALIZER(pf_purge_tick, NULL);
-
-void            pf_purge(void *);
-struct task     pf_purge_task =
-                    TASK_INITIALIZER(pf_purge, NULL);
-
 void
-pf_purge_tick(void *null)
+pf_purge(void *xnloops)
 {
-       task_add(systqmp, &pf_purge_task);
-}
+       int *nloops = xnloops;
 
-void
-pf_purge(void *null)
-{
-       unsigned int interval = max(1, pf_default_rule.timeout[PFTM_INTERVAL]);
+       /*
+        * process a fraction of the state table every second
+        * Note:
+        *     we no longer need PF_LOCK() here, because
+        *     pf_purge_expired_states() uses pf_state_lock to maintain
+        *     consistency.
+        */
+       if (pf_default_rule.timeout[PFTM_INTERVAL] > 0)
+               pf_purge_expired_states(1 + (pf_status.states
+                   / pf_default_rule.timeout[PFTM_INTERVAL]));
 
-       /* XXX is NET_LOCK necessary? */
        NET_LOCK();
 
        PF_LOCK();
-
-       pf_purge_expired_src_nodes();
-
+       /* purge other expired types every PFTM_INTERVAL seconds */
+       if (++(*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL])
+               pf_purge_expired_src_nodes();
        PF_UNLOCK();
 
        /*
         * Fragments don't require PF_LOCK(), they use their own lock.
         */
-       pf_purge_expired_fragments();
+       if ((*nloops) >= pf_default_rule.timeout[PFTM_INTERVAL]) {
+               pf_purge_expired_fragments();
+               *nloops = 0;
+       }
        NET_UNLOCK();
 
-       /* interpret the interval as idle time between runs */
-       timeout_add_sec(&pf_purge_to, interval);
+       timeout_add_sec(&pf_purge_to, 1);
 }
 
 int32_t
@@ -1819,8 +1760,8 @@ pf_free_state(struct pf_state *cur)
        pf_status.states--;
 }
 
-unsigned int
-pf_purge_expired_states(const unsigned int limit, const unsigned int collect)
+void
+pf_purge_expired_states(u_int32_t maxcheck)
 {
        /*
         * this task/thread/context/whatever is the only thing that
@@ -1834,8 +1775,6 @@ pf_purge_expired_states(const unsigned i
        struct pf_state         *st;
        SLIST_HEAD(pf_state_gcl, pf_state) gcl = SLIST_HEAD_INITIALIZER(gcl);
        time_t                   now;
-       unsigned int             scanned;
-       unsigned int             collected = 0;
 
        PF_ASSERT_UNLOCKED();
 
@@ -1849,7 +1788,7 @@ pf_purge_expired_states(const unsigned i
        if (head == NULL) {
                /* the list is empty */
                rw_exit_read(&pf_state_list.pfs_rwl);
-               return (limit);
+               return;
        }
 
        /* (re)start at the front of the list */
@@ -1858,38 +1797,28 @@ pf_purge_expired_states(const unsigned i
 
        now = getuptime();
 
-       for (scanned = 0; scanned < limit; scanned++) {
+       do {
                uint8_t stimeout = cur->timeout;
-               unsigned int limited = 0;
 
                if ((stimeout == PFTM_UNLINKED) ||
                    (pf_state_expires(cur, stimeout) <= now)) {
                        st = pf_state_ref(cur);
                        SLIST_INSERT_HEAD(&gcl, st, gc_list);
-
-                       if (++collected >= collect)
-                               limited = 1;
                }
 
                /* don't iterate past the end of our view of the list */
                if (cur == tail) {
-                       scanned = limit;
                        cur = NULL;
                        break;
                }
 
                cur = TAILQ_NEXT(cur, entry_list);
-
-               /* don't spend too much time here. */
-               if (ISSET(READ_ONCE(curcpu()->ci_schedstate.spc_schedflags),
-                    SPCF_SHOULDYIELD) || limited)
-                       break;
-       }
+       } while (maxcheck--);
 
        rw_exit_read(&pf_state_list.pfs_rwl);
 
        if (SLIST_EMPTY(&gcl))
-               return (scanned);
+               return;
 
        NET_LOCK();
        rw_enter_write(&pf_state_list.pfs_rwl);
@@ -1910,8 +1839,6 @@ pf_purge_expired_states(const unsigned i
                SLIST_REMOVE_HEAD(&gcl, gc_list);
                pf_state_unref(st);
        }
-
-       return (scanned);
 }
 
 int
Index: net/pf_ioctl.c
===================================================================
RCS file: /cvs/src/sys/net/pf_ioctl.c,v
retrieving revision 1.391
diff -u -p -r1.391 pf_ioctl.c
--- net/pf_ioctl.c      11 Nov 2022 16:12:08 -0000      1.391
+++ net/pf_ioctl.c      25 Nov 2022 18:33:08 -0000
@@ -1145,7 +1145,6 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
                                pf_status.stateid = gettime();
                                pf_status.stateid = pf_status.stateid << 32;
                        }
-                       timeout_add_sec(&pf_purge_states_to, 1);
                        timeout_add_sec(&pf_purge_to, 1);
                        pf_create_queues();
                        DPFPRINTF(LOG_NOTICE, "pf: started");
@@ -2741,9 +2740,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a
                        pf_default_rule.timeout[i] =
                            pf_default_rule_new.timeout[i];
                        if (pf_default_rule.timeout[i] == PFTM_INTERVAL &&
-                           pf_default_rule.timeout[i] < old &&
-                           timeout_del(&pf_purge_to))
-                               task_add(systqmp, &pf_purge_task);
+                           pf_default_rule.timeout[i] < old)
+                               task_add(net_tq(0), &pf_purge_task);
                }
                pfi_xcommit();
                pf_trans_set_commit();
Index: net/pfvar.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar.h,v
retrieving revision 1.520
diff -u -p -r1.520 pfvar.h
--- net/pfvar.h 11 Nov 2022 16:12:08 -0000      1.520
+++ net/pfvar.h 25 Nov 2022 18:33:08 -0000
@@ -1633,6 +1633,7 @@ extern void                        
pf_tbladdr_remove(struct 
 extern void                     pf_tbladdr_copyout(struct pf_addr_wrap *);
 extern void                     pf_calc_skip_steps(struct pf_rulequeue *);
 extern void                     pf_purge_expired_src_nodes(void);
+extern void                     pf_purge_expired_states(u_int32_t);
 extern void                     pf_purge_expired_rules(void);
 extern void                     pf_remove_state(struct pf_state *);
 extern void                     pf_remove_divert_state(struct pf_state_key *);
Index: net/pfvar_priv.h
===================================================================
RCS file: /cvs/src/sys/net/pfvar_priv.h,v
retrieving revision 1.22
diff -u -p -r1.22 pfvar_priv.h
--- net/pfvar_priv.h    24 Nov 2022 00:04:32 -0000      1.22
+++ net/pfvar_priv.h    25 Nov 2022 18:33:08 -0000
@@ -267,7 +267,6 @@ struct pf_pdesc {
        } hdr;
 };
 
-extern struct timeout  pf_purge_states_to;
 extern struct task     pf_purge_task;
 extern struct timeout  pf_purge_to;
 
@@ -319,6 +318,9 @@ extern struct rwlock        pf_state_lock;
                        splassert_fail(RW_WRITE,        \
                            rw_status(&pf_state_lock), __func__);\
        } while (0)
+
+extern void                     pf_purge_timeout(void *);
+extern void                     pf_purge(void *);
 
 /* for copies to/from network byte order */
 void                   pf_state_peer_hton(const struct pf_state_peer *,

Reply via email to