The branch main has been updated by tuexen:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=bfdd5b643d23171c53920accc2f15f78e984dfae

commit bfdd5b643d23171c53920accc2f15f78e984dfae
Author:     Nick Banks <[email protected]>
AuthorDate: 2025-10-13 20:31:30 +0000
Commit:     Michael Tuexen <[email protected]>
CommitDate: 2025-10-13 20:31:30 +0000

    tcp: improve testing of HPTS
    
    Improve the HPTS API to allow testing and add several tests.
    
    Reviewed by:    tuexen
    Sponsored by:   Netflix, Inc.
---
 sys/conf/NOTES              |    1 +
 sys/conf/options            |    1 +
 sys/netinet/tcp_hpts.c      |  758 +++++++++++---------
 sys/netinet/tcp_hpts.h      |   30 +-
 sys/netinet/tcp_hpts_test.c | 1616 ++++++++++++++++++++++++++++++++++++++++++-
 sys/netinet/tcp_lro_hpts.c  |    3 +
 sys/tests/ktest.h           |    7 +
 7 files changed, 2055 insertions(+), 361 deletions(-)

diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index ea9b2667607e..a25ee8f6e1af 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -679,6 +679,7 @@ options     TCP_OFFLOAD             # TCP offload support.
 options        TCP_RFC7413             # TCP Fast Open
 
 options        TCPHPTS
+#options       TCP_HPTS_KTEST          # Add KTEST support for HPTS
 
 # In order to enable IPSEC you MUST also add device crypto to
 # your kernel configuration
diff --git a/sys/conf/options b/sys/conf/options
index b48ad1cf42cf..0b795a8d28fb 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -231,6 +231,7 @@ SYSVSEM             opt_sysvipc.h
 SYSVSHM                opt_sysvipc.h
 SW_WATCHDOG    opt_watchdog.h
 TCPHPTS
+TCP_HPTS_KTEST opt_inet.h
 TCP_REQUEST_TRK opt_global.h
 TCP_ACCOUNTING opt_global.h
 TCP_BBR                opt_inet.h
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index 133703a5ede1..2631e79ab034 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -149,27 +149,44 @@
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_hpts.h>
+#include <netinet/tcp_hpts_internal.h>
 #include <netinet/tcp_log_buf.h>
 
 #ifdef tcp_offload
 #include <netinet/tcp_offload.h>
 #endif
 
-/*
- * The hpts uses a 102400 wheel. The wheel
- * defines the time in 10 usec increments (102400 x 10).
- * This gives a range of 10usec - 1024ms to place
- * an entry within. If the user requests more than
- * 1.024 second, a remaineder is attached and the hpts
- * when seeing the remainder will re-insert the
- * inpcb forward in time from where it is until
- * the remainder is zero.
- */
+/* Global instance for TCP HPTS */
+struct tcp_hptsi *tcp_hptsi_pace;
+
+/* Default function table for production use. */
+const struct tcp_hptsi_funcs tcp_hptsi_default_funcs = {
+       .microuptime = microuptime,
+       .swi_add = swi_add,
+       .swi_remove = swi_remove,
+       .swi_sched = swi_sched,
+       .intr_event_bind = intr_event_bind,
+       .intr_event_bind_ithread_cpuset = intr_event_bind_ithread_cpuset,
+       .callout_init = callout_init,
+       .callout_reset_sbt_on = callout_reset_sbt_on,
+       ._callout_stop_safe = _callout_stop_safe,
+};
 
-#define NUM_OF_HPTSI_SLOTS 102400
+#ifdef TCP_HPTS_KTEST
+#define microuptime pace->funcs->microuptime
+#define swi_add pace->funcs->swi_add
+#define swi_remove pace->funcs->swi_remove
+#define swi_sched pace->funcs->swi_sched
+#define intr_event_bind pace->funcs->intr_event_bind
+#define intr_event_bind_ithread_cpuset pace->funcs->intr_event_bind_ithread_cpuset
+#define callout_init pace->funcs->callout_init
+#define callout_reset_sbt_on pace->funcs->callout_reset_sbt_on
+#define _callout_stop_safe pace->funcs->_callout_stop_safe
+#endif
 
-/* The number of connections after which the dynamic sleep logic kicks in. */
-#define DEFAULT_CONNECTION_THRESHOLD 100
+static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+
+static void tcp_hpts_thread(void *ctx);
 
 /*
  * When using the hpts, a TCP stack must make sure
@@ -212,75 +229,14 @@
  *
  */
 
-/* Each hpts has its own p_mtx which is used for locking */
-#define        HPTS_MTX_ASSERT(hpts)   mtx_assert(&(hpts)->p_mtx, MA_OWNED)
-#define        HPTS_LOCK(hpts)         mtx_lock(&(hpts)->p_mtx)
-#define        HPTS_TRYLOCK(hpts)      mtx_trylock(&(hpts)->p_mtx)
-#define        HPTS_UNLOCK(hpts)       mtx_unlock(&(hpts)->p_mtx)
-struct tcp_hpts_entry {
-       /* Cache line 0x00 */
-       struct mtx p_mtx;       /* Mutex for hpts */
-       struct timeval p_mysleep;       /* Our min sleep time */
-       uint64_t syscall_cnt;
-       uint64_t sleeping;      /* What the actual sleep was (if sleeping) */
-       uint16_t p_hpts_active; /* Flag that says hpts is awake  */
-       uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
-       uint32_t p_runningslot; /* Current slot we are at if we are running */
-       uint32_t p_prev_slot;   /* Previous slot we were on */
-       uint32_t p_cur_slot;    /* Current slot in wheel hpts is draining */
-       uint32_t p_nxt_slot;    /* The next slot outside the current range of
-                                * slots that the hpts is running on. */
-       int32_t p_on_queue_cnt; /* Count on queue in this hpts */
-       uint8_t p_direct_wake :1, /* boolean */
-               p_on_min_sleep:1, /* boolean */
-               p_hpts_wake_scheduled:1, /* boolean */
-               hit_callout_thresh:1,
-               p_avail:4;
-       uint8_t p_fill[3];        /* Fill to 32 bits */
-       /* Cache line 0x40 */
-       struct hptsh {
-               TAILQ_HEAD(, tcpcb)     head;
-               uint32_t                count;
-               uint32_t                gencnt;
-       } *p_hptss;                     /* Hptsi wheel */
-       uint32_t p_hpts_sleep_time;     /* Current sleep interval having a max
-                                        * of 255ms */
-       uint32_t overidden_sleep;       /* what was overrided by min-sleep for logging */
-       uint32_t saved_curslot;         /* for logging */
-       uint32_t saved_prev_slot;       /* for logging */
-       uint32_t p_delayed_by;  /* How much were we delayed by */
-       /* Cache line 0x80 */
-       struct sysctl_ctx_list hpts_ctx;
-       struct sysctl_oid *hpts_root;
-       struct intr_event *ie;
-       void *ie_cookie;
-       uint16_t p_num;         /* The hpts number one per cpu */
-       uint16_t p_cpu;         /* The hpts CPU */
-       /* There is extra space in here */
-       /* Cache line 0x100 */
-       struct callout co __aligned(CACHE_LINE_SIZE);
-}               __aligned(CACHE_LINE_SIZE);
-
-static struct tcp_hptsi {
-       struct cpu_group **grps;
-       struct tcp_hpts_entry **rp_ent; /* Array of hptss */
-       uint32_t *cts_last_ran;
-       uint32_t grp_cnt;
-       uint32_t rp_num_hptss;  /* Number of hpts threads */
-} tcp_pace;
-
-static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
 #ifdef RSS
-static int tcp_bind_threads = 1;
+int tcp_bind_threads = 1;
 #else
-static int tcp_bind_threads = 2;
+int tcp_bind_threads = 2;
 #endif
 static int tcp_use_irq_cpu = 0;
 static int hpts_does_tp_logging = 0;
-
-static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);
-static void tcp_hpts_thread(void *ctx);
-
+static int32_t tcp_hpts_precision = 120;
 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
 static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD;
 static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
@@ -291,23 +247,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TCP Hpts statistics");
 
-#define        timersub(tvp, uvp, vvp)                                         \
-       do {                                                            \
-               (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;          \
-               (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;       \
-               if ((vvp)->tv_usec < 0) {                               \
-                       (vvp)->tv_sec--;                                \
-                       (vvp)->tv_usec += 1000000;                      \
-               }                                                       \
-       } while (0)
-
-static int32_t tcp_hpts_precision = 120;
-
-static struct hpts_domain_info {
-       int count;
-       int cpu[MAXCPU];
-} hpts_domains[MAXMEMDOM];
-
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -455,14 +394,14 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
     &tcp_hpts_no_wake_over_thresh, 0,
     "When we are over the threshold on the pacer do we prohibit wakeups?");
 
-static uint16_t
-hpts_random_cpu(void)
+uint16_t
+tcp_hptsi_random_cpu(struct tcp_hptsi *pace)
 {
        uint16_t cpuid;
        uint32_t ran;
 
        ran = arc4random();
-       cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+       cpuid = (((ran & 0xffff) % mp_ncpus) % pace->rp_num_hptss);
        return (cpuid);
 }
 
@@ -504,11 +443,67 @@ tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
        }
 }
 
+/*
+ * Timeout handler for the HPTS sleep callout. It immediately schedules the SWI
+ * for the HPTS entry to run.
+ */
 static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
+tcp_hpts_sleep_timeout(void *arg)
 {
+#ifdef TCP_HPTS_KTEST
+       struct tcp_hptsi *pace;
+#endif
+       struct tcp_hpts_entry *hpts;
+
+       hpts = (struct tcp_hpts_entry *)arg;
+#ifdef TCP_HPTS_KTEST
+       pace = hpts->p_hptsi;
+#endif
+       swi_sched(hpts->ie_cookie, 0);
+}
+
+/*
+ * Reset the HPTS callout timer with the provided timeval. Returns the results
+ * of the callout_reset_sbt_on() function.
+ */
+static int
+tcp_hpts_sleep(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+#ifdef TCP_HPTS_KTEST
+       struct tcp_hptsi *pace;
+#endif
+       sbintime_t sb;
+
+#ifdef TCP_HPTS_KTEST
+       pace = hpts->p_hptsi;
+#endif
+
+       /* Store off to make visible the actual sleep time */
+       hpts->sleeping = tv->tv_usec;
+
+       sb = tvtosbt(*tv);
+       return (callout_reset_sbt_on(
+                   &hpts->co, sb, 0, tcp_hpts_sleep_timeout, hpts, hpts->p_cpu,
+                   (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))));
+}
+
+/*
+ * Schedules the SWI for the HTPS entry to run, if not already scheduled or
+ * running.
+ */
+void
+tcp_hpts_wake(struct tcp_hpts_entry *hpts)
+{
+#ifdef TCP_HPTS_KTEST
+       struct tcp_hptsi *pace;
+#endif
+
        HPTS_MTX_ASSERT(hpts);
 
+#ifdef TCP_HPTS_KTEST
+       pace = hpts->p_hptsi;
+#endif
+
        if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
                hpts->p_direct_wake = 0;
                return;
@@ -519,15 +514,6 @@ tcp_wakehpts(struct tcp_hpts_entry *hpts)
        }
 }
 
-static void
-hpts_timeout_swi(void *arg)
-{
-       struct tcp_hpts_entry *hpts;
-
-       hpts = (struct tcp_hpts_entry *)arg;
-       swi_sched(hpts->ie_cookie, 0);
-}
-
 static void
 tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 {
@@ -556,13 +542,13 @@ tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 }
 
 static struct tcp_hpts_entry *
-tcp_hpts_lock(struct tcpcb *tp)
+tcp_hpts_lock(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
        struct tcp_hpts_entry *hpts;
 
        INP_LOCK_ASSERT(tptoinpcb(tp));
 
-       hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
+       hpts = pace->rp_ent[tp->t_hpts_cpu];
        HPTS_LOCK(hpts);
 
        return (hpts);
@@ -589,11 +575,10 @@ tcp_hpts_release(struct tcpcb *tp)
  * and has never received a first packet.
  */
 void
-tcp_hpts_init(struct tcpcb *tp)
+__tcp_hpts_init(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
-
        if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) {
-               tp->t_hpts_cpu = hpts_random_cpu();
+               tp->t_hpts_cpu = tcp_hptsi_random_cpu(pace);
                MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
        }
 }
@@ -605,14 +590,14 @@ tcp_hpts_init(struct tcpcb *tp)
  * INP lock and then get the hpts lock.
  */
 void
-tcp_hpts_remove(struct tcpcb *tp)
+__tcp_hpts_remove(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
        struct tcp_hpts_entry *hpts;
        struct hptsh *hptsh;
 
        INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-       hpts = tcp_hpts_lock(tp);
+       hpts = tcp_hpts_lock(pace, tp);
        if (tp->t_in_hpts == IHPTS_ONQUEUE) {
                hptsh = &hpts->p_hptss[tp->t_hpts_slot];
                tp->t_hpts_request = 0;
@@ -847,10 +832,11 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
 
 void
 #ifdef INVARIANTS
-__tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, int32_t line,
-       struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+       int32_t line, struct hpts_diag *diag)
 #else
-tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
+__tcp_hpts_insert(struct tcp_hptsi *pace, struct tcpcb *tp, uint32_t slot,
+       struct hpts_diag *diag)
 #endif
 {
        struct tcp_hpts_entry *hpts;
@@ -868,7 +854,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
         * current run (if up) or where it was when it stopped if it is
         * sleeping.
         */
-       hpts = tcp_hpts_lock(tp);
+       hpts = tcp_hpts_lock(pace, tp);
        microuptime(&tv);
        if (diag) {
                memset(diag, 0, sizeof(struct hpts_diag));
@@ -903,7 +889,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
                         * timeout is not 1.
                         */
                        hpts->p_direct_wake = 1;
-                       tcp_wakehpts(hpts);
+                       tcp_hpts_wake(hpts);
                }
                HPTS_UNLOCK(hpts);
 
@@ -991,7 +977,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
         */
        if (need_wakeup) {
                hpts->p_direct_wake = 1;
-               tcp_wakehpts(hpts);
+               tcp_hpts_wake(hpts);
                if (diag) {
                        diag->need_new_to = 0;
                        diag->co_ret = 0xffff0000;
@@ -999,7 +985,6 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
        } else if (need_new_to) {
                int32_t co_ret;
                struct timeval tv;
-               sbintime_t sb;
 
                tv.tv_sec = 0;
                tv.tv_usec = 0;
@@ -1007,11 +992,8 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
                        tv.tv_sec++;
                        need_new_to -= HPTS_USEC_IN_SEC;
                }
-               tv.tv_usec = need_new_to;
-               sb = tvtosbt(tv);
-               co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
-                                             hpts_timeout_swi, hpts, hpts->p_cpu,
-                                             (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+               tv.tv_usec = need_new_to; /* XXX: Why is this sleeping over the max? */
+               co_ret = tcp_hpts_sleep(hpts, &tv);
                if (diag) {
                        diag->need_new_to = need_new_to;
                        diag->co_ret = co_ret;
@@ -1021,7 +1003,7 @@ tcp_hpts_insert(struct tcpcb *tp, uint32_t slot, struct hpts_diag *diag)
 }
 
 static uint16_t
-hpts_cpuid(struct tcpcb *tp, int *failed)
+hpts_cpuid(struct tcp_hptsi *pace, struct tcpcb *tp, int *failed)
 {
        struct inpcb *inp = tptoinpcb(tp);
        u_int cpuid;
@@ -1048,7 +1030,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
 #ifdef RSS
        cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
        if (cpuid == NETISR_CPUID_NONE)
-               return (hpts_random_cpu());
+               return (tcp_hptsi_random_cpu(pace));
        else
                return (cpuid);
 #endif
@@ -1059,7 +1041,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
         */
        if (inp->inp_flowtype == M_HASHTYPE_NONE) {
                counter_u64_add(cpu_uses_random, 1);
-               return (hpts_random_cpu());
+               return (tcp_hptsi_random_cpu(pace));
        }
        /*
         * Hash to a thread based on the flowid.  If we are using numa,
@@ -1074,7 +1056,7 @@ hpts_cpuid(struct tcpcb *tp, int *failed)
 #ifdef NUMA
        } else {
                /* Hash into the cpu's that use that domain */
-               di = &hpts_domains[inp->inp_numa_domain];
+               di = &pace->domains[inp->inp_numa_domain];
                cpuid = di->cpu[inp->inp_flowid % di->count];
        }
 #endif
@@ -1112,9 +1094,10 @@ tcp_hpts_different_slots(uint32_t cts, uint32_t cts_last_run)
        return ((cts / HPTS_USECS_PER_SLOT) != (cts_last_run / HPTS_USECS_PER_SLOT));
 }
 
-static int32_t
+int32_t
 tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
 {
+       struct tcp_hptsi *pace;
        struct tcpcb *tp;
        struct timeval tv;
        int32_t slots_to_run, i, error;
@@ -1132,12 +1115,17 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout)
 
        HPTS_MTX_ASSERT(hpts);
        NET_EPOCH_ASSERT();
+
+       pace = hpts->p_hptsi;
+       MPASS(pace != NULL);
+
        /* record previous info for any logging */
        hpts->saved_curslot = hpts->p_cur_slot;
        hpts->saved_prev_slot = hpts->p_prev_slot;
 
-       cts_last_run = tcp_pace.cts_last_ran[hpts->p_num];
-       tcp_pace.cts_last_ran[hpts->p_num] = cts = tcp_get_usecs(&tv);
+       microuptime(&tv);
+       cts_last_run = pace->cts_last_ran[hpts->p_cpu];
+       pace->cts_last_ran[hpts->p_cpu] = cts = tcp_tv_to_usec(&tv);
 
        orig_exit_slot = hpts->p_cur_slot = cts_to_wheel(cts);
        if ((hpts->p_on_queue_cnt == 0) ||
@@ -1383,7 +1371,7 @@ again:
                                 * gets added to the hpts (not this one)
                                 * :-)
                                 */
-                               tcp_set_hpts(tp);
+                               __tcp_set_hpts(pace, tp);
                        }
                        CURVNET_SET(inp->inp_vnet);
                        /* Lets do any logging that we might want to */
@@ -1445,8 +1433,9 @@ no_one:
         * more slots (if we did not hit eno-bufs).
         */
        hpts->p_prev_slot = hpts->p_cur_slot;
+       microuptime(&tv);
        cts_last_run = cts;
-       cts = tcp_get_usecs(&tv);
+       cts = tcp_tv_to_usec(&tv);
        if (!from_callout || (loop_cnt > max_pacer_loops)) {
                /*
                 * Something is serious slow we have
@@ -1478,7 +1467,7 @@ no_one:
                goto again;
        }
 no_run:
-       tcp_pace.cts_last_ran[hpts->p_num] = cts;
+       pace->cts_last_ran[hpts->p_cpu] = cts;
        /*
         * Set flag to tell that we are done for
         * any slot input that happens during
@@ -1512,7 +1501,8 @@ no_run:
                 cts_last_run, cts, loop_cnt, wrap_loop_cnt));
 
        if (from_callout && tcp_hpts_different_slots(cts, cts_last_run)) {
-               cts = tcp_get_usecs(&tv);
+               microuptime(&tv);
+               cts = tcp_tv_to_usec(&tv);
                hpts->p_cur_slot = cts_to_wheel(cts);
                counter_u64_add(hpts_loops, 1);
                goto again;
@@ -1528,16 +1518,16 @@ no_run:
 }
 
 void
-tcp_set_hpts(struct tcpcb *tp)
+__tcp_set_hpts(struct tcp_hptsi *pace, struct tcpcb *tp)
 {
        struct tcp_hpts_entry *hpts;
        int failed;
 
        INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-       hpts = tcp_hpts_lock(tp);
+       hpts = tcp_hpts_lock(pace, tp);
        if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
-               tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
+               tp->t_hpts_cpu = hpts_cpuid(pace, tp, &failed);
                if (failed == 0)
                        tp->t_flags2 |= TF2_HPTS_CPU_SET;
        }
@@ -1545,33 +1535,35 @@ tcp_set_hpts(struct tcpcb *tp)
 }
 
 static struct tcp_hpts_entry *
-tcp_choose_hpts_to_run(void)
+tcp_choose_hpts_to_run(struct tcp_hptsi *pace)
 {
+       struct timeval tv;
        int i, oldest_idx, start, end;
        uint32_t cts, time_since_ran, calc;
 
-       cts = tcp_get_usecs(NULL);
+       microuptime(&tv);
+       cts = tcp_tv_to_usec(&tv);
        time_since_ran = 0;
        /* Default is all one group */
        start = 0;
-       end = tcp_pace.rp_num_hptss;
+       end = pace->rp_num_hptss;
        /*
         * If we have more than one L3 group figure out which one
         * this CPU is in.
         */
-       if (tcp_pace.grp_cnt > 1) {
-               for (i = 0; i < tcp_pace.grp_cnt; i++) {
-                       if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
-                               start = tcp_pace.grps[i]->cg_first;
-                               end = (tcp_pace.grps[i]->cg_last + 1);
+       if (pace->grp_cnt > 1) {
+               for (i = 0; i < pace->grp_cnt; i++) {
+                       if (CPU_ISSET(curcpu, &pace->grps[i]->cg_mask)) {
+                               start = pace->grps[i]->cg_first;
+                               end = (pace->grps[i]->cg_last + 1);
                                break;
                        }
                }
        }
        oldest_idx = -1;
        for (i = start; i < end; i++) {
-               if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i]))
-                       calc = cts - tcp_pace.cts_last_ran[i];
+               if (TSTMP_GT(cts, pace->cts_last_ran[i]))
+                       calc = cts - pace->cts_last_ran[i];
                else
                        calc = 0;
                if (calc > time_since_ran) {
@@ -1580,9 +1572,9 @@ tcp_choose_hpts_to_run(void)
                }
        }
        if (oldest_idx >= 0)
-               return(tcp_pace.rp_ent[oldest_idx]);
+               return(pace->rp_ent[oldest_idx]);
        else
-               return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+               return(pace->rp_ent[(curcpu % pace->rp_num_hptss)]);
 }
 
 static void
@@ -1592,7 +1584,7 @@ __tcp_run_hpts(void)
        struct tcp_hpts_entry *hpts;
        int slots_ran;
 
-       hpts = tcp_choose_hpts_to_run();
+       hpts = tcp_choose_hpts_to_run(tcp_hptsi_pace);
 
        if (hpts->p_hpts_active) {
                /* Already active */
@@ -1613,7 +1605,6 @@ __tcp_run_hpts(void)
        if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
                if (slots_ran > slots_indicate_less_sleep) {
                        struct timeval tv;
-                       sbintime_t sb;
 
                        hpts->p_mysleep.tv_usec /= 2;
                        if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1637,12 +1628,7 @@ __tcp_run_hpts(void)
                         * the dynamic value and set the on_min_sleep
                         * flag so we will not be awoken.
                         */
-                       sb = tvtosbt(tv);
-                       /* Store off to make visible the actual sleep time */
-                       hpts->sleeping = tv.tv_usec;
-                       callout_reset_sbt_on(&hpts->co, sb, 0,
-                                            hpts_timeout_swi, hpts, hpts->p_cpu,
-                                            (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+                       (void)tcp_hpts_sleep(hpts, &tv);
                } else if (slots_ran < slots_indicate_more_sleep) {
                        /* For the further sleep, don't reschedule  hpts */
                        hpts->p_mysleep.tv_usec *= 2;
@@ -1660,17 +1646,22 @@ out_with_mtx:
 static void
 tcp_hpts_thread(void *ctx)
 {
+#ifdef TCP_HPTS_KTEST
+       struct tcp_hptsi *pace;
+#endif
        struct tcp_hpts_entry *hpts;
        struct epoch_tracker et;
        struct timeval tv;
-       sbintime_t sb;
        int slots_ran;
 
        hpts = (struct tcp_hpts_entry *)ctx;
+#ifdef TCP_HPTS_KTEST
+       pace = hpts->p_hptsi;
+#endif
        HPTS_LOCK(hpts);
        if (hpts->p_direct_wake) {
                /* Signaled by input or output with low occupancy count. */
-               callout_stop(&hpts->co);
+               _callout_stop_safe(&hpts->co, 0);
                counter_u64_add(hpts_direct_awakening, 1);
        } else {
                /* Timed out, the normal case. */
@@ -1799,18 +1790,11 @@ tcp_hpts_thread(void *ctx)
        hpts->p_hpts_active = 0;
 back_to_sleep:
        hpts->p_direct_wake = 0;
-       sb = tvtosbt(tv);
-       /* Store off to make visible the actual sleep time */
-       hpts->sleeping = tv.tv_usec;
-       callout_reset_sbt_on(&hpts->co, sb, 0,
-                            hpts_timeout_swi, hpts, hpts->p_cpu,
-                            (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+       (void)tcp_hpts_sleep(hpts, &tv);
        NET_EPOCH_EXIT(et);
        HPTS_UNLOCK(hpts);
 }
 
-#undef timersub
-
 static int32_t
 hpts_count_level(struct cpu_group *cg)
 {
@@ -1847,57 +1831,63 @@ hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_g
        }
 }
 
-static void
-tcp_hpts_mod_load(void)
+/*
+ * Initialize a tcp_hptsi structure. This performs the core initialization
+ * without starting threads.
+ */
+struct tcp_hptsi*
+tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs, bool enable_sysctl)
 {
+       struct tcp_hptsi *pace;
        struct cpu_group *cpu_top;
-       int32_t error __diagused;
-       int32_t i, j, bound = 0, created = 0;
+       uint32_t i, j, cts;
+       int32_t count;
        size_t sz, asz;
        struct timeval tv;
-       sbintime_t sb;
        struct tcp_hpts_entry *hpts;
-       struct pcpu *pc;
        char unit[16];
        uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
-       int count, domain;
 
+       KASSERT(funcs != NULL, ("funcs is NULL"));
+
+       /* Allocate the main structure */
+       pace = malloc(sizeof(struct tcp_hptsi), M_TCPHPTS, M_WAITOK | M_ZERO);
+       if (pace == NULL)
+               return (NULL);
+
+       memset(pace, 0, sizeof(*pace));
+       pace->funcs = funcs;
+
+       /* Setup CPU topology information */
 #ifdef SMP
        cpu_top = smp_topo();
 #else
        cpu_top = NULL;
 #endif
-       tcp_pace.rp_num_hptss = ncpus;
-       hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
-       hpts_loops = counter_u64_alloc(M_WAITOK);
-       back_tosleep = counter_u64_alloc(M_WAITOK);
-       combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
-       wheel_wrap = counter_u64_alloc(M_WAITOK);
-       hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
-       hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
-       hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
-       hpts_direct_call = counter_u64_alloc(M_WAITOK);
-       cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
-       cpu_uses_random = counter_u64_alloc(M_WAITOK);
+       pace->rp_num_hptss = ncpus;
 
-       sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
-       tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
-       sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
-       tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
-       tcp_pace.grp_cnt = 0;
+       /* Allocate hpts entry array */
+       sz = (pace->rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+       pace->rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+
+       /* Allocate timestamp tracking array */
+       sz = (sizeof(uint32_t) * pace->rp_num_hptss);
+       pace->cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+
+       /* Setup CPU groups */
        if (cpu_top == NULL) {
-               tcp_pace.grp_cnt = 1;
+               pace->grp_cnt = 1;
        } else {
                /* Find out how many cache level 3 domains we have */
                count = 0;
-               tcp_pace.grp_cnt = hpts_count_level(cpu_top);
-               if (tcp_pace.grp_cnt == 0) {
-                       tcp_pace.grp_cnt = 1;
+               pace->grp_cnt = hpts_count_level(cpu_top);
+               if (pace->grp_cnt == 0) {
+                       pace->grp_cnt = 1;
                }
-               sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
-               tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+               sz = (pace->grp_cnt * sizeof(struct cpu_group *));
+               pace->grps = malloc(sz, M_TCPHPTS, M_WAITOK);
                /* Now populate the groups */
-               if (tcp_pace.grp_cnt == 1) {
+               if (pace->grp_cnt == 1) {
                        /*
                         * All we need is the top level all cpu's are in
                         * the same cache so when we use grp[0]->cg_mask
@@ -1905,188 +1895,290 @@ tcp_hpts_mod_load(void)
                         * all cpu's in it. The level here is probably
                         * zero which is ok.
                         */
-                       tcp_pace.grps[0] = cpu_top;
+                       pace->grps[0] = cpu_top;
                } else {
                        /*
                         * Here we must find all the level three cache domains
                         * and setup our pointers to them.
                         */
                        count = 0;
-                       hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+                       hpts_gather_grps(pace->grps, &count, pace->grp_cnt, cpu_top);
                }
        }
+
+       /* Cache the current time for initializing the hpts entries */
+       microuptime(&tv);
+       cts = tcp_tv_to_usec(&tv);
+
+       /* Initialize each hpts entry */
        asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
-       for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-               tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+       for (i = 0; i < pace->rp_num_hptss; i++) {
+               pace->rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
                    M_TCPHPTS, M_WAITOK | M_ZERO);
-               tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
-               hpts = tcp_pace.rp_ent[i];
-               /*
-                * Init all the hpts structures that are not specifically
-                * zero'd by the allocations. Also lets attach them to the
-                * appropriate sysctl block as well.
-                */
-               mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
-                   "hpts", MTX_DEF | MTX_DUPOK);
-               for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
-                       TAILQ_INIT(&hpts->p_hptss[j].head);
-                       hpts->p_hptss[j].count = 0;
-                       hpts->p_hptss[j].gencnt = 0;
-               }
-               sysctl_ctx_init(&hpts->hpts_ctx);
-               sprintf(unit, "%d", i);
-               hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
-                   SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
-                   OID_AUTO,
-                   unit,
-                   CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
-                   "");
-               SYSCTL_ADD_INT(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "out_qcnt", CTLFLAG_RD,
-                   &hpts->p_on_queue_cnt, 0,
-                   "Count TCB's awaiting output processing");
-               SYSCTL_ADD_U16(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "active", CTLFLAG_RD,
-                   &hpts->p_hpts_active, 0,
-                   "Is the hpts active");
-               SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "curslot", CTLFLAG_RD,
-                   &hpts->p_cur_slot, 0,
-                   "What the current running pacers goal");
-               SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "runslot", CTLFLAG_RD,
-                   &hpts->p_runningslot, 0,
-                   "What the running pacers current slot is");
-               SYSCTL_ADD_UINT(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "lastran", CTLFLAG_RD,
-                   &tcp_pace.cts_last_ran[i], 0,
-                   "The last usec timestamp that this hpts ran");
-               SYSCTL_ADD_LONG(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
-                   &hpts->p_mysleep.tv_usec,
-                   "What the running pacers is using for p_mysleep.tv_usec");
-               SYSCTL_ADD_U64(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "now_sleeping", CTLFLAG_RD,
-                   &hpts->sleeping, 0,
-                   "What the running pacers is actually sleeping for");
-               SYSCTL_ADD_U64(&hpts->hpts_ctx,
-                   SYSCTL_CHILDREN(hpts->hpts_root),
-                   OID_AUTO, "syscall_cnt", CTLFLAG_RD,
-                   &hpts->syscall_cnt, 0,
-                   "How many times we had syscalls on this hpts");
+               pace->rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS,
+                   M_WAITOK | M_ZERO);
+               hpts = pace->rp_ent[i];
 
+               /* Basic initialization */
                hpts->p_hpts_sleep_time = hpts_sleep_max;
-               hpts->p_num = i;
-               tcp_pace.cts_last_ran[i] = tcp_get_u64_usecs(&tv);
-               hpts->p_cur_slot = cts_to_wheel(tcp_pace.cts_last_ran[i]);
+               hpts->p_cpu = i;
+               pace->cts_last_ran[i] = cts;
+               hpts->p_cur_slot = cts_to_wheel(cts);
                hpts->p_prev_slot = hpts->p_cur_slot;
-               hpts->p_cpu = 0xffff;
                hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
                callout_init(&hpts->co, 1);
+               hpts->p_hptsi = pace;
+               mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts",
+                   MTX_DEF | MTX_DUPOK);
+               for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+                       TAILQ_INIT(&hpts->p_hptss[j].head);
+               }
+
+               /* Setup SYSCTL if requested */
+               if (enable_sysctl) {
+                       sysctl_ctx_init(&hpts->hpts_ctx);
+                       sprintf(unit, "%d", i);
+                       hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+                           SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+                           OID_AUTO,
+                           unit,
+                           CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+                           "");
+                       SYSCTL_ADD_INT(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "out_qcnt", CTLFLAG_RD,
+                           &hpts->p_on_queue_cnt, 0,
+                           "Count TCB's awaiting output processing");
+                       SYSCTL_ADD_U16(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "active", CTLFLAG_RD,
+                           &hpts->p_hpts_active, 0,
+                           "Is the hpts active");
+                       SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "curslot", CTLFLAG_RD,
+                           &hpts->p_cur_slot, 0,
+                           "What the current running pacers goal");
+                       SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "runslot", CTLFLAG_RD,
+                           &hpts->p_runningslot, 0,
+                           "What the running pacers current slot is");
+                       SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "lastran", CTLFLAG_RD,
+                           &pace->cts_last_ran[i], 0,
+                           "The last usec timestamp that this hpts ran");
+                       SYSCTL_ADD_LONG(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+                           &hpts->p_mysleep.tv_usec,
+                           "What the running pacers is using for p_mysleep.tv_usec");
+                       SYSCTL_ADD_U64(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "now_sleeping", CTLFLAG_RD,
+                           &hpts->sleeping, 0,
+                           "What the running pacers is actually sleeping for");
+                       SYSCTL_ADD_U64(&hpts->hpts_ctx,
+                           SYSCTL_CHILDREN(hpts->hpts_root),
+                           OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+                           &hpts->syscall_cnt, 0,
+                           "How many times we had syscalls on this hpts");
+               }
        }
-       /* Don't try to bind to NUMA domains if we don't have any */
-       if (vm_ndomains == 1 && tcp_bind_threads == 2)
-               tcp_bind_threads = 0;
 
-       /*
-        * Now lets start ithreads to handle the hptss.
-        */
-       for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
-               hpts = tcp_pace.rp_ent[i];
-               hpts->p_cpu = i;
+       return (pace);
+}
+
+/*
+ * Create threads for a tcp_hptsi structure and starts timers for the current
+ * (minimum) sleep interval.
+ */
+void
+tcp_hptsi_start(struct tcp_hptsi *pace)
+{
+       struct tcp_hpts_entry *hpts;
+       struct pcpu *pc;
+       struct timeval tv;
+       uint32_t i, j;
+       int count, domain;
+       int error __diagused;
+
+       KASSERT(pace != NULL, ("tcp_hptsi_start: pace is NULL"));
+
+       /* Start threads for each hpts entry */
+       for (i = 0; i < pace->rp_num_hptss; i++) {
+               hpts = pace->rp_ent[i];
+
+               KASSERT(hpts->ie_cookie == NULL,
+                   ("tcp_hptsi_start: hpts[%d]->ie_cookie is not NULL", i));
 
                error = swi_add(&hpts->ie, "hpts",
*** 2014 LINES SKIPPED ***

Reply via email to