Author: rrs
Date: Thu Aug  1 14:17:31 2019
New Revision: 350501
URL: https://svnweb.freebsd.org/changeset/base/350501

Log:
  This adds the third step in getting BBR into the tree. BBR and
  an updated rack depend on having access to the new
  ratelimit api in this commit.
  
  Sponsored by: Netflix Inc.
  Differential Revision:        https://reviews.freebsd.org/D20953

Added:
  head/sys/netinet/tcp_ratelimit.c   (contents, props changed)
  head/sys/netinet/tcp_ratelimit.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sched.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/net/if_dead.c
  head/sys/net/if_lagg.c
  head/sys/net/if_var.h
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/conf/files Thu Aug  1 14:17:31 2019        (r350501)
@@ -4276,6 +4276,7 @@ netinet/tcp_lro.c         optional inet | inet6
 netinet/tcp_output.c           optional inet | inet6
 netinet/tcp_offload.c          optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_hpts.c              optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c         optional ratelimit inet | ratelimit inet6
 netinet/tcp_pcap.c             optional inet tcppcap | inet6 tcppcap \
        compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c            optional inet | inet6

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h        Thu Aug  1 14:13:04 2019        
(r350500)
+++ head/sys/dev/cxgbe/adapter.h        Thu Aug  1 14:17:31 2019        
(r350501)
@@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
 void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results 
*);
 #endif
 
 /* t4_filter.c */

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c        Thu Aug  1 14:13:04 2019        
(r350500)
+++ head/sys/dev/cxgbe/t4_main.c        Thu Aug  1 14:17:31 2019        
(r350501)
@@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
        ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
        ifp->if_snd_tag_query = cxgbe_snd_tag_query;
        ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+       ifp->if_ratelimit_query = cxgbe_ratelimit_query;
 #endif
 
        ifp->if_capabilities = T4_CAP;

Modified: head/sys/dev/cxgbe/t4_sched.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sched.c       Thu Aug  1 14:13:04 2019        
(r350500)
+++ head/sys/dev/cxgbe/t4_sched.c       Thu Aug  1 14:17:31 2019        
(r350501)
@@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst)
        }
        mtx_unlock(&cst->lock);
 }
+
+/* Testing so far shows this is all the flows this adapter can do. */
+#define CXGBE_MAX_FLOWS 4000
+/* Number of unique rates that can be set up. */
+#define CXGBE_UNIQUE_RATE_COUNT 16
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+    struct if_ratelimit_query_results *q)
+{
+       /*
+        * Skeleton implementation that still needs work from the
+        * driver supporters: it should look at the specific type
+        * of interface and pick appropriate values. What is
+        * reported here goes with an earlier card (t5), which
+        * offers at most 16 rates that the first users get to
+        * select (hence the RT_IS_SELECTABLE flag). A fixed table
+        * would instead be described with a const array, as mlx5
+        * does. The card tested can reasonably support only 4000
+        * flows before it has trouble sending, so the number of
+        * hardware paced flows is capped at that figure; other
+        * cards may be able to raise or eliminate the limit.
+        */
+       q->flags = RT_IS_SELECTABLE;
+       q->rate_table = NULL;
+       q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+       q->max_flows = CXGBE_MAX_FLOWS;
+       q->min_segment_burst = 4;       /* Driver emits 4 in a burst */
+}
 #endif

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c    Thu Aug  1 14:13:04 2019        
(r350500)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c    Thu Aug  1 14:17:31 2019        
(r350501)
@@ -4070,7 +4070,49 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_sn
        }
 }
 
+/*
+ * Fixed table of the 13 pre-set hardware rates reported for the MLX
+ * card, in ascending order. The figure in the comment beside each
+ * entry is exactly eight times the entry (presumably the entry is in
+ * bytes per second and the comment in bits per second -- confirm).
+ */
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+       135375,                 /* 1,083,000 */
+       180500,                 /* 1,444,000 */
+       270750,                 /* 2,166,000 */
+       361000,                 /* 2,888,000 */
+       541500,                 /* 4,332,000 */
+       721875,                 /* 5,775,000 */
+       1082875,                /* 8,663,000 */
+       1443875,                /* 11,551,000 */
+       2165750,                /* 17,326,000 */
+       2887750,                /* 23,102,000 */
+       4331625,                /* 34,653,000 */
+       5775500,                /* 46,204,000 */
+       8663125                 /* 69,305,000 */
+};
+
 static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused,
+    struct if_ratelimit_query_results *q)
+{
+       /*
+        * The driver maintainer needs to update this function!
+        * The MLX card currently (ConnectX-4?) exposes 13 pre-set
+        * rates; what others, i.e. ConnectX-5, 6, 7, offer is an
+        * open question.
+        *
+        * Later adapters will differ, so this code should look at
+        * ifp and work out the settings of the specific adapter
+        * type: how many rates there are and whether they are
+        * fixed (as shown here) or dynamic (example chelsio t4).
+        * Any maximum number of flows the adapter can handle
+        * belongs in the max_flows field as well.
+        */
+       q->flags = RT_IS_FIXED_TABLE;
+       q->rate_table = adapter_rates_mlx;
+       q->number_of_rates = NUM_HDWR_RATES_MLX;
+       q->min_segment_burst = 1;
+       q->max_flows = 0;       /* mlx has no limit */
+}
+
+static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
        struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
        ifp->if_snd_tag_free = mlx5e_snd_tag_free;
        ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
        ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+       ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
        /* set TSO limits so that we don't have to drop TX packets */
        ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + 
ETHER_VLAN_ENCAP_LEN);
        ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;

Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c      Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/net/if_dead.c      Thu Aug  1 14:17:31 2019        (r350501)
@@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt)
 {
 }
 
+/*
+ * Ratelimit query method installed on a dead interface: reports
+ * RT_NOSUPPORT with no rate table, no rates and no flows.
+ */
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+      struct if_ratelimit_query_results *q)
+{
+       /*
+        * This guy does not support
+        * this interface. Not sure
+        * why we would specify a
+        * flag on the interface
+        * that says we do.
+        */
+       q->rate_table = NULL;
+       q->flags = RT_NOSUPPORT;
+       q->max_flows = 0;
+       q->number_of_rates = 0;
+       /*
+        * Fill in every result field, as the hardware drivers do,
+        * so the caller never reads a value this method left
+        * untouched.
+        */
+       q->min_segment_burst = 0;
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp)
        ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
        ifp->if_snd_tag_query = ifdead_snd_tag_query;
        ifp->if_snd_tag_free = ifdead_snd_tag_free;
+       ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }

Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c      Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/net/if_lagg.c      Thu Aug  1 14:17:31 2019        (r350501)
@@ -144,6 +144,8 @@ static int  lagg_snd_tag_modify(struct m_snd_tag *,
 static int     lagg_snd_tag_query(struct m_snd_tag *,
                    union if_snd_tag_query_params *);
 static void    lagg_snd_tag_free(struct m_snd_tag *);
+static void     lagg_ratelimit_query(struct ifnet *,
+                   struct if_ratelimit_query_results *);
 #endif
 static int     lagg_setmulti(struct lagg_port *);
 static int     lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, cadd
        ifp->if_snd_tag_modify = lagg_snd_tag_modify;
        ifp->if_snd_tag_query = lagg_snd_tag_query;
        ifp->if_snd_tag_free = lagg_snd_tag_free;
+       ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
        ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
@@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst)
        free(lst, M_LAGG);
 }
 
+/*
+ * Ratelimit query method for lagg: reports RT_IS_INDIRECT with no
+ * rate table, no rates and no flows.
+ */
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused,
+    struct if_ratelimit_query_results *q)
+{
+       /*
+        * For lagg, we have an indirect
+        * interface. The caller needs to
+        * get a ratelimit tag on the actual
+        * interface the flow will go on.
+        */
+       q->rate_table = NULL;
+       q->flags = RT_IS_INDIRECT;
+       q->max_flows = 0;
+       q->number_of_rates = 0;
+       /*
+        * Fill in every result field, as the hardware drivers do,
+        * so the caller never reads a value this method left
+        * untouched.
+        */
+       q->min_segment_burst = 0;
+}
 #endif
 
 static int

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h       Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/net/if_var.h       Thu Aug  1 14:17:31 2019        (r350501)
@@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header {
 struct if_snd_tag_alloc_rate_limit {
        struct if_snd_tag_alloc_header hdr;
        uint64_t max_rate;      /* in bytes/s */
+       uint32_t flags;         /* M_NOWAIT or M_WAITOK */
+       uint32_t reserved;      /* alignment */
 };
 
 struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params {
        uint32_t queue_level;   /* 0 (empty) .. 65535 (full) */
 #define        IF_SND_QUEUE_LEVEL_MIN 0
 #define        IF_SND_QUEUE_LEVEL_MAX 65535
-       uint32_t reserved;      /* padding */
+       uint32_t flags;         /* M_NOWAIT or M_WAITOK */
 };
 
 union if_snd_tag_alloc_params {
@@ -229,12 +231,38 @@ union if_snd_tag_query_params {
        struct if_snd_tag_rate_limit_params unlimited;
 };
 
+/* Query return flags */
+#define RT_NOSUPPORT     0x00000000    /* Not supported */
+#define RT_IS_INDIRECT    0x00000001   /*
+                                        * Interface like a lagg, select
+                                        * the actual interface for
+                                        * capabilities.
+                                        */
+#define RT_IS_SELECTABLE  0x00000002   /*
+                                        * No rate table, you select
+                                        * rates and the first
+                                        * number_of_rates are created.
+                                        */
+#define RT_IS_FIXED_TABLE 0x00000004   /* A fixed table is attached */
+#define RT_IS_UNUSABLE   0x00000008    /* It is not usable for this */
+
+/*
+ * Results filled in by an interface's if_ratelimit_query method.
+ * "flags" carries the RT_* query return flags defined above.
+ */
+struct if_ratelimit_query_results {
+       const uint64_t *rate_table;     /* Pointer to table if present */
+       uint32_t flags;                 /* Flags indicating results */
+       uint32_t max_flows;             /* Max flows using, 0=unlimited */
+       uint32_t number_of_rates;       /* How many unique rates can be created */
+       uint32_t min_segment_burst;     /* The amount the adapter bursts at each send */
+};
+
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params 
*,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union 
if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union 
if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+    struct if_ratelimit_query_results *);
 
+
 /*
  * Structure defining a network interface.
  */
@@ -374,6 +402,7 @@ struct ifnet {
        if_snd_tag_modify_t *if_snd_tag_modify;
        if_snd_tag_query_t *if_snd_tag_query;
        if_snd_tag_free_t *if_snd_tag_free;
+       if_ratelimit_query_t *if_ratelimit_query;
 
        /* Ethernet PCP */
        uint8_t if_pcp;

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c   Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/netinet/in_pcb.c   Thu Aug  1 14:17:31 2019        (r350501)
@@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim
        &VNET_NAME(ipport_randomtime), 0,
        "Minimum time to keep sequental port "
        "allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+    "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+    &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+   &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+   &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
 #endif /* INET */
 
 /*
@@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_p
 {
        union if_snd_tag_modify_params params = {
                .rate_limit.max_rate = max_pacing_rate,
+               .rate_limit.flags = M_NOWAIT,
        };
        struct m_snd_tag *mst;
        struct ifnet *ifp;
@@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_tx
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
-    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate,
+    struct m_snd_tag **st)
 {
        union if_snd_tag_alloc_params params = {
                .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *
                .rate_limit.hdr.flowid = flowid,
                .rate_limit.hdr.flowtype = flowtype,
                .rate_limit.max_rate = max_pacing_rate,
+               .rate_limit.flags = M_NOWAIT,
        };
        int error;
 
        INP_WLOCK_ASSERT(inp);
 
-       if (inp->inp_snd_tag != NULL)
+       if (*st != NULL)
                return (EINVAL);
 
        if (ifp->if_snd_tag_alloc == NULL) {
                error = EOPNOTSUPP;
        } else {
-               error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+               /* Allocate into the caller's slot, the one checked above. */
+               error = ifp->if_snd_tag_alloc(ifp, &params, st);
+               if (error == 0) {
+                       counter_u64_add(rate_limit_set_ok, 1);
+                       counter_u64_add(rate_limit_active, 1);
+               } else
+                       counter_u64_add(rate_limit_alloc_fail, 1);
        }
        return (error);
 }
 
+/*
+ * Give send tag "mst" back to interface "ifp": call the interface's
+ * if_snd_tag_free method, drop one reference on the interface and
+ * decrement the rate_limit_active counter. Does nothing when ifp is
+ * NULL.
+ */
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+       if (ifp != NULL) {
+               /*
+                * Should the device have been detached while we
+                * still held reference(s) on the ifp, the assumption
+                * is that if_snd_tag_free() was replaced with stubs.
+                */
+               ifp->if_snd_tag_free(mst);
+
+               /* drop our reference on the network interface */
+               if_rele(ifp);
+               counter_u64_add(rate_limit_active, -1);
+       }
+}
+
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
@@ -3300,49 +3343,12 @@ in_pcbdetach_txrtlmt(struct inpcb *inp)
        m_snd_tag_rele(mst);
 }
 
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf 
*mb, uint32_t max_pacing_rate)
 {
-       struct socket *socket;
-       uint32_t max_pacing_rate;
-       bool did_upgrade;
        int error;
 
-       if (inp == NULL)
-               return;
-
-       socket = inp->inp_socket;
-       if (socket == NULL)
-               return;
-
-       if (!INP_WLOCKED(inp)) {
-               /*
-                * NOTE: If the write locking fails, we need to bail
-                * out and use the non-ratelimited ring for the
-                * transmit until there is a new chance to get the
-                * write lock.
-                */
-               if (!INP_TRY_UPGRADE(inp))
-                       return;
-               did_upgrade = 1;
-       } else {
-               did_upgrade = 0;
-       }
-
        /*
-        * NOTE: The so_max_pacing_rate value is read unlocked,
-        * because atomic updates are not required since the variable
-        * is checked at every mbuf we send. It is assumed that the
-        * variable read itself will be atomic.
-        */
-       max_pacing_rate = socket->so_max_pacing_rate;
-
-       /*
         * If the existing send tag is for the wrong interface due to
         * a route change, first drop the existing tag.  Set the
         * CHANGED flag so that we will keep trying to allocate a new
@@ -3376,13 +3382,61 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *
                        error = EAGAIN;
                } else {
                        error = in_pcbattach_txrtlmt(inp, ifp, 
M_HASHTYPE_GET(mb),
-                           mb->m_pkthdr.flowid, max_pacing_rate);
+                           mb->m_pkthdr.flowid, max_pacing_rate, 
&inp->inp_snd_tag);
                }
        } else {
                error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
        }
        if (error == 0 || error == EOPNOTSUPP)
                inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+       return (error);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+       struct socket *socket;
+       uint32_t max_pacing_rate;
+       bool did_upgrade;
+
+       if (inp == NULL)
+               return;
+
+       socket = inp->inp_socket;
+       if (socket == NULL)
+               return;
+
+       if (!INP_WLOCKED(inp)) {
+               /*
+                * NOTE: If the write locking fails, we need to bail
+                * out and use the non-ratelimited ring for the
+                * transmit until there is a new chance to get the
+                * write lock.
+                */
+               if (!INP_TRY_UPGRADE(inp))
+                       return;
+               did_upgrade = true;
+       } else {
+               did_upgrade = false;
+       }
+
+       /*
+        * NOTE: The so_max_pacing_rate value is read unlocked,
+        * because atomic updates are not required since the variable
+        * is checked at every mbuf we send. It is assumed that the
+        * variable read itself will be atomic.
+        */
+       max_pacing_rate = socket->so_max_pacing_rate;
+
+       /*
+        * This wrapper returns void, so the status of the locked
+        * variant is deliberately discarded rather than parked in
+        * a local that is never read.
+        */
+       (void)in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
+
        if (did_upgrade)
                INP_DOWNGRADE(inp);
 }
@@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp)
        if (did_upgrade)
                INP_DOWNGRADE(inp);
 }
+
+/*
+ * Allocate the three rate limit statistics counters. Each allocation
+ * is made with M_WAITOK.
+ */
+static void
+rl_init(void *st)
+{
+       rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+       rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+       rate_limit_active = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif /* RATELIMIT */

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h   Thu Aug  1 14:13:04 2019        (r350500)
+++ head/sys/netinet/in_pcb.h   Thu Aug  1 14:17:31 2019        (r350501)
@@ -883,8 +883,13 @@ struct sockaddr *
        in_sockaddr(in_port_t port, struct in_addr *addr);
 void   in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
-int    in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, 
uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+           struct mbuf *, uint32_t);
+int    in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+           uint32_t, struct m_snd_tag **);
 void   in_pcbdetach_txrtlmt(struct inpcb *);
+void    in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
 int    in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int    in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int    in_pcbquery_txrlevel(struct inpcb *, uint32_t *);

Added: head/sys/netinet/tcp_ratelimit.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/netinet/tcp_ratelimit.c    Thu Aug  1 14:17:31 2019        
(r350501)
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ *     Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <r...@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES              /* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an ethernet frame.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+/*
+ * Table of desired rates. The label beside each entry is the entry
+ * multiplied by eight (presumably entries are bytes per second and
+ * labels bits per second -- confirm).
+ *
+ * NOTE(review): two entries were originally labelled 250Mbps and
+ * 350Mbps, but 20000000 * 8 is 160Mbps and 37500000 * 8 is 300Mbps.
+ * True 250Mbps and 350Mbps entries would be 31250000 and 43750000.
+ * The values are left as they were; confirm which was intended.
+ */
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+       62500,                  /* 500Kbps */
+       180500,                 /* 1.44Mbps */
+       375000,                 /* 3Mbps */
+       500000,                 /* 4Mbps */
+       625000,                 /* 5Mbps */
+       750000,                 /* 6Mbps */
+       1000000,                /* 8Mbps */
+       1250000,                /* 10Mbps */
+       2500000,                /* 20Mbps */
+       3750000,                /* 30Mbps */
+       5000000,                /* 40Mbps */
+       6250000,                /* 50Mbps */
+       12500000,               /* 100Mbps */
+       25000000,               /* 200Mbps */
+       50000000,               /* 400Mbps */
+       100000000,              /* 800Mbps */
+       12500,                  /* 100kbps */
+       25000,                  /* 200kbps */
+       875000,                 /* 7Mbps */
+       1125000,                /* 9Mbps */
+       1875000,                /* 15Mbps */
+       3125000,                /* 25Mbps */
+       8125000,                /* 65Mbps */
+       10000000,               /* 80Mbps */
+       18750000,               /* 150Mbps */
+       20000000,               /* 160Mbps (was labelled 250Mbps) */
+       37500000,               /* 300Mbps (was labelled 350Mbps) */
+       62500000,               /* 500Mbps */
+       78125000,               /* 625Mbps */
+       125000000,              /* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16    /*
+                                * Number that are in order
+                                * at the beginning of the table,
+                                * over this a sort is required.
+                                */
+#define RS_NEXT_ORDER_GROUP 16 /*
+                                * The point in our table where
+                                * we start to fill in a second ordered
+                                * group (index wise means -1).
+                                */
+#define ALL_HARDWARE_RATES 1004 /*
+                                * 1Meg - 1Gig in 1 Meg steps
+                                * plus 100, 200k  and 500k and
+                                * 10Gig
+                                */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+    "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+    &rs_number_alive, 0,
+    "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+    &rs_number_dead, 0,
+    "Number of interfaces departing from ratelimiting");
+
+/*
+ * Attach the per-interface sysctl leaves for rate set "rs" beneath
+ * "rl_sysctl_root", registering each of them in rs->sysctl_ctx. The
+ * "disable" knob is read-only when RS_INTF_NO_SUP is set in
+ * rs->rs_flags and read-write otherwise. When built with
+ * DETAILED_RATELIMIT_SYSCTL and a rate table is present, a "rate"
+ * subtree holding one numbered node per table entry is added too.
+ */
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root,
+    struct tcp_rate_set *rs)
+{
+       /*
+        * Add sysctl entries for this interface.
+        */
+       if (rs->rs_flags & RS_INTF_NO_SUP) {
+               SYSCTL_ADD_S32(&rs->sysctl_ctx,
+                  SYSCTL_CHILDREN(rl_sysctl_root),
+                  OID_AUTO, "disable", CTLFLAG_RD,
+                  &rs->rs_disable, 0,
+                  "Disable this interface from new hdwr limiting?");
+       } else {
+               SYSCTL_ADD_S32(&rs->sysctl_ctx,
+                  SYSCTL_CHILDREN(rl_sysctl_root),
+                  OID_AUTO, "disable", CTLFLAG_RW,
+                  &rs->rs_disable, 0,
+                  "Disable this interface from new hdwr limiting?");
+       }
+       SYSCTL_ADD_S32(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "minseg", CTLFLAG_RW,
+           &rs->rs_min_seg, 0,
+           "What is the minimum we need to send on this interface?");
+       SYSCTL_ADD_U64(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "flow_limit", CTLFLAG_RW,
+           &rs->rs_flow_limit, 0,
+           "What is the limit for number of flows (0=unlimited)?");
+       SYSCTL_ADD_S32(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "highest", CTLFLAG_RD,
+           &rs->rs_highest_valid, 0,
+           "Highest valid rate");
+       SYSCTL_ADD_S32(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "lowest", CTLFLAG_RD,
+           &rs->rs_lowest_valid, 0,
+           "Lowest valid rate");
+       SYSCTL_ADD_S32(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "flags", CTLFLAG_RD,
+           &rs->rs_flags, 0,
+           "What flags are on the entry?");
+       SYSCTL_ADD_S32(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "numrates", CTLFLAG_RD,
+           &rs->rs_rate_cnt, 0,
+           "How many rates are there?");
+       SYSCTL_ADD_U64(&rs->sysctl_ctx,
+           SYSCTL_CHILDREN(rl_sysctl_root),
+           OID_AUTO, "flows_using", CTLFLAG_RD,
+           &rs->rs_flows_using, 0,
+           "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+       if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+               /*  Let's display the rates */
+               int i;
+               struct sysctl_oid *rl_rates;
+               struct sysctl_oid *rl_rate_num;
+               char rate_num[16];
+               rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+                                           SYSCTL_CHILDREN(rl_sysctl_root),
+                                           OID_AUTO,
+                                           "rate",
+                                           CTLFLAG_RW, 0,
+                                           "Ratelist");
+               for (i = 0; i < rs->rs_rate_cnt; i++) {
+                       /* The node name is the table index; keep the write bounded. */
+                       snprintf(rate_num, sizeof(rate_num), "%d", i);
+                       rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+                                           SYSCTL_CHILDREN(rl_rates),
+                                           OID_AUTO,
+                                           rate_num,
+                                           CTLFLAG_RW, 0,
+                                           "Individual Rate");
+                       SYSCTL_ADD_U32(&rs->sysctl_ctx,
+                                      SYSCTL_CHILDREN(rl_rate_num),
+                                      OID_AUTO, "flags", CTLFLAG_RD,
+                                      &rs->rs_rlt[i].flags, 0,
+                                      "Flags on this rate");
+                       SYSCTL_ADD_U32(&rs->sysctl_ctx,
+                                      SYSCTL_CHILDREN(rl_rate_num),
+                                      OID_AUTO, "pacetime", CTLFLAG_RD,
+                                      &rs->rs_rlt[i].time_between, 0,
+                                      "Time hardware inserts between "
+                                      "1500 byte sends");
+                       SYSCTL_ADD_U64(&rs->sysctl_ctx,
+                                      SYSCTL_CHILDREN(rl_rate_num),
+                                      OID_AUTO, "rate", CTLFLAG_RD,
+                                      &rs->rs_rlt[i].rate, 0,
+                                      "Rate in bytes per second");
+               }
+       }
+#endif
+}
+
+/*
+ * Epoch callback that finishes the deferred teardown of a rate set.
+ *
+ * ctx is the rs_epoch_ctx member embedded in the tcp_rate_set being
+ * retired; the enclosing structure is recovered with __containerof().
+ * RS_FUNERAL_SCHD is always cleared.  The rate set itself is released
+ * only when no flow is using it any more.
+ */
+static void
+rs_destroy(epoch_context_t ctx)
+{
+       struct tcp_rate_set *rs;
+       int do_free;
+
+       rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+       mtx_lock(&rs_mtx);
+       rs->rs_flags &= ~RS_FUNERAL_SCHD;
+       /*
+        * In theory it is possible (but unlikely)
+        * that while the delete was occurring
+        * and we were applying the DEAD flag
+        * someone slipped in and found the
+        * interface in a lookup. While we
+        * decided rs_flows_using was 0 and
+        * scheduled the epoch_call, the other
+        * thread incremented rs_flows_using. This
+        * is because users have a pointer and
+        * we only use rs_flows_using in an
+        * atomic fashion, i.e. the other entities
+        * are not protected. To assure this did
+        * not occur, we check rs_flows_using here
+        * before deleting.
+        */
+       do_free = (rs->rs_flows_using == 0);
+       if (do_free)
+               rs_number_dead--;
+       mtx_unlock(&rs_mtx);
+
+       /*
+        * Tear down only after dropping rs_mtx: elsewhere in this
+        * file the mutex is released around the sysctl calls, and
+        * the same rule is followed here for sysctl_ctx_free().
+        */
+       if (do_free) {
+               sysctl_ctx_free(&rs->sysctl_ctx);
+               free(rs->rs_rlt, M_TCPPACE);
+               free(rs, M_TCPPACE);
+       }
+}
+
+/*
+ * Rate-limit statistics counters.  They are only declared here (their
+ * storage lives elsewhere) and are bumped by rl_attach_txrtlmt()
+ * according to the outcome of a send-tag allocation.
+ */
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+/*
+ * Request a rate-limit send tag from the driver behind ifp.
+ *
+ * The request carries the given flowtype, flowid and cfg_rate and is
+ * flagged M_NOWAIT.  On success the new tag is stored through *tag,
+ * a reference is taken on the tag's ifp, and the "set ok" and
+ * "active" counters are bumped.  A failed allocation bumps the
+ * "alloc fail" counter instead.
+ *
+ * Returns 0 on success, EOPNOTSUPP when the interface has no
+ * if_snd_tag_alloc method, or the error reported by that method.
+ */
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+    uint32_t flowtype,
+    int flowid,
+    uint64_t cfg_rate,
+    struct m_snd_tag **tag)
+{
+       union if_snd_tag_alloc_params params = {
+               .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+               .rate_limit.hdr.flowid = flowid,
+               .rate_limit.hdr.flowtype = flowtype,
+               .rate_limit.max_rate = cfg_rate,
+               .rate_limit.flags = M_NOWAIT,
+       };
+       int rv;
+
+       /* Nothing to do for an interface that cannot allocate tags. */
+       if (ifp->if_snd_tag_alloc == NULL)
+               return (EOPNOTSUPP);
+
+       rv = ifp->if_snd_tag_alloc(ifp, &params, tag);
+       if (rv != 0) {
+               counter_u64_add(rate_limit_alloc_fail, 1);
+               return (rv);
+       }
+
+       /* Hold the interface the tag is bound to. */
+       if_ref((*tag)->ifp);
+       counter_u64_add(rate_limit_set_ok, 1);
+       counter_u64_add(rate_limit_active, 1);
+       return (0);
+}
+
+/*
+ * Fill rs->rs_rlt[0 .. rs_rate_cnt-1] from the internal rate table.
+ *
+ * The internal table is "special": it holds two separately ordered
+ * groups, the first starting at index 0 and the second starting at
+ * index RS_NEXT_ORDER_GROUP and ending before MAX_HDWR_RATES.  The
+ * two groups are merged here, smallest rate first.  We get here when
+ * the adapter offers a number of rates that covers both groups in
+ * some form.
+ *
+ * Every slot has its flags and time_between cleared.  A slot reached
+ * after both groups are used up keeps whatever rate it already held.
+ */
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+       int i, low_idx, high_idx, take_low;
+       uint8_t low_done = 0, high_done = 0;
+
+       low_idx = 0;
+       high_idx = RS_NEXT_ORDER_GROUP;
+       for (i = 0; i < rs->rs_rate_cnt; i++) {
+               rs->rs_rlt[i].flags = 0;
+               rs->rs_rlt[i].time_between = 0;
+               /*
+                * Take from the low group while it has entries and
+                * either the high group is used up or the low
+                * candidate is the smaller rate.  Short-circuiting
+                * keeps us from reading past a finished group.
+                */
+               take_low = (low_done == 0) &&
+                   (high_done ||
+                    (rate_table_act[low_idx] < rate_table_act[high_idx]));
+               if (take_low) {
+                       rs->rs_rlt[i].rate = rate_table_act[low_idx];
+                       low_idx++;
+                       if (low_idx == RS_NEXT_ORDER_GROUP)
+                               low_done = 1;
+               } else if (high_done == 0) {
+                       rs->rs_rlt[i].rate = rate_table_act[high_idx];
+                       high_idx++;
+                       if (high_idx == MAX_HDWR_RATES)
+                               high_done = 1;
+               }
+       }
+}
+
+static struct tcp_rate_set *
+rt_setup_new_rs(struct ifnet *ifp, int *error)
+{
+       struct tcp_rate_set *rs;
+       const uint64_t *rate_table_act;
+       uint64_t lentim, res;
+       size_t sz;
+       uint32_t hash_type;
+       int i;
+       struct if_ratelimit_query_results rl;
+       struct sysctl_oid *rl_sysctl_root;
+       /*
+        * We expect to enter with the 
+        * mutex locked.
+        */
+
+       if (ifp->if_ratelimit_query == NULL) {
+               /*
+                * We can do nothing if we cannot
+                * get a query back from the driver.
+                */
+               return (NULL);
+       }
+       rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
+       if (rs == NULL) {
+               if (error)
+                       *error = ENOMEM;
+               return (NULL);
+       }
+       rl.flags = RT_NOSUPPORT;
+       ifp->if_ratelimit_query(ifp, &rl);
+       if (rl.flags & RT_IS_UNUSABLE) {
+               /* 
+                * The interface does not really support 
+                * the rate-limiting.
+                */
+               memset(rs, 0, sizeof(struct tcp_rate_set));
+               rs->rs_ifp = ifp;
+               rs->rs_if_dunit = ifp->if_dunit;
+               rs->rs_flags = RS_INTF_NO_SUP;
+               rs->rs_disable = 1;
+               rs_number_alive++;
+               sysctl_ctx_init(&rs->sysctl_ctx);
+               rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+                   SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+                   OID_AUTO,
+                   rs->rs_ifp->if_xname,
+                   CTLFLAG_RW, 0,
+                   "");
+               CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+               /* Unlock to allow the sysctl stuff to allocate */
+               mtx_unlock(&rs_mtx);
+               rl_add_syctl_entries(rl_sysctl_root, rs);
+               /* re-lock for our caller */
+               mtx_lock(&rs_mtx);
+               return (rs);
+       } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
+               memset(rs, 0, sizeof(struct tcp_rate_set));
+               rs->rs_ifp = ifp;
+               rs->rs_if_dunit = ifp->if_dunit;
+               rs->rs_flags = RS_IS_DEFF;
+               rs_number_alive++;
+               sysctl_ctx_init(&rs->sysctl_ctx);
+               rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+                   SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+                   OID_AUTO,
+                   rs->rs_ifp->if_xname,
+                   CTLFLAG_RW, 0,
+                   "");
+               CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+               /* Unlock to allow the sysctl stuff to allocate */
+               mtx_unlock(&rs_mtx);
+               rl_add_syctl_entries(rl_sysctl_root, rs);
+               /* re-lock for our caller */
+               mtx_lock(&rs_mtx);
+               return (rs);
+       } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
+               /* Mellanox most likely */
+               rs->rs_ifp = ifp;
+               rs->rs_if_dunit = ifp->if_dunit;
+               rs->rs_rate_cnt = rl.number_of_rates;
+               rs->rs_min_seg = rl.min_segment_burst;
+               rs->rs_highest_valid = 0;
+               rs->rs_flow_limit = rl.max_flows;
+               rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
+               rs->rs_disable = 0;
+               rate_table_act = rl.rate_table;
+       } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
+               /* Chelsio */
+               rs->rs_ifp = ifp;
+               rs->rs_if_dunit = ifp->if_dunit;
+               rs->rs_rate_cnt = rl.number_of_rates;
+               rs->rs_min_seg = rl.min_segment_burst;
+               rs->rs_disable = 0;
+               rs->rs_flow_limit = rl.max_flows;
+               rate_table_act = desired_rates;
+               if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
+                   (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
+                       /*
+                        * Our desired table is not big
+                        * enough, do what we can.
+                        */
+                       rs->rs_rate_cnt = MAX_HDWR_RATES;
+                }
+               if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
+                       rs->rs_flags = RS_IS_INTF;
+               else
+                       rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
+               if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
+                       rs->rs_rate_cnt = ALL_HARDWARE_RATES;
+       } else {
+               printf("Interface:%s unit:%d not one known to have rate-limits\n",
+                   ifp->if_dname,
+                   ifp->if_dunit);
+               free(rs, M_TCPPACE);
+               return (NULL);
+       }
+       sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
+       rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
+       if (rs->rs_rlt == NULL) {
+               if (error)
+                       *error = ENOMEM;
+bail:
+               free(rs, M_TCPPACE);
+               return (NULL);
+       }
+       if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
+               /*
+                * The interface supports all
+                * the rates we could possibly want.
+                */
+               uint64_t rat;
+
+               rs->rs_rlt[0].rate = 12500;     /* 100k */
+               rs->rs_rlt[1].rate = 25000;     /* 200k */
+               rs->rs_rlt[2].rate = 62500;     /* 500k */
+               /* Note 125000 == 1Megabit
+                * populate 1Meg - 1000meg.
+                */
+               for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
+                       rs->rs_rlt[i].rate = rat;
+                       rat += 125000;
+               }
+               rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
+       } else if (rs->rs_flags & RS_INT_TBL) {
+               /* We populate this in a special way */
+               populate_canned_table(rs, rate_table_act);
+       } else {
+               /*

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to