Author: hselasky
Date: Wed Jan 18 13:31:17 2017
New Revision: 312379
URL: https://svnweb.freebsd.org/changeset/base/312379

Log:
  Implement kernel support for hardware rate limited sockets.
  
  - Add RATELIMIT kernel configuration keyword which must be set to
  enable the new functionality.
  
  - Add support for hardware driven, Receive Side Scaling (RSS) aware, rate
  limited sendqueues and expose the functionality through the already
  established SO_MAX_PACING_RATE setsockopt(). The API supports rates in
  the range from 1 byte/s to 4 Gbytes/s, which are suitable for regular TCP
  and UDP streams. The setsockopt(2) manual page has been updated.
  
  - Add rate limit function callback API to "struct ifnet" which supports
  the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
  if_snd_tag_query() and if_snd_tag_free().
  
  - Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
  flag, which tells if a network driver supports rate limiting or not.
  
  - This patch also adds support for rate limiting through VLAN and LAGG
  intermediate network devices.
  
  - How rate limiting works:
  
  1) The userspace application calls setsockopt() after accepting or
  making a new connection to set the rate which is then stored in the
  socket structure in the kernel. Later on when packets are transmitted
  a check is made in the transmit path for rate changes. A rate change
  implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
  destination network interface, which then sets up a custom sendqueue
  with the given rate limitation parameter. A "struct m_snd_tag" pointer is
  returned which serves as a "snd_tag" hint in the m_pkthdr for the
  subsequently transmitted mbufs.
  
  2) When the network driver sees the "m->m_pkthdr.snd_tag" different
  from NULL, it will move the packets into a designated rate limited sendqueue
  given by the snd_tag pointer. It is up to the individual drivers how the rate
  limited traffic will be rate limited.
  
  3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
  routine when the ifnet pointer in the incoming snd_tag mismatches the
  one of the network interface. The network adapter frees the mbuf and
  returns EAGAIN, which causes ip_output() to release and clear the send
  tag. On the next ip_output() call a new "snd_tag" allocation is attempted.
  
  4) When the PCB is detached the custom sendqueue will be released by a
  non-blocking ifp->if_snd_tag_free() call to the currently bound network
  interface.
  
  Reviewed by:          wblock (manpages), adrian, gallatin, scottl (network)
  Differential Revision:        https://reviews.freebsd.org/D3687
  Sponsored by:         Mellanox Technologies
  MFC after:            3 months

Modified:
  head/lib/libc/sys/getsockopt.2
  head/sbin/ifconfig/ifconfig.8
  head/sbin/ifconfig/ifconfig.c
  head/sys/conf/NOTES
  head/sys/conf/config.mk
  head/sys/conf/kern.opts.mk
  head/sys/conf/options
  head/sys/kern/uipc_socket.c
  head/sys/modules/if_lagg/Makefile
  head/sys/modules/if_vlan/Makefile
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/ieee8023ad_lacp.h
  head/sys/net/if.h
  head/sys/net/if_dead.c
  head/sys/net/if_lagg.c
  head/sys/net/if_var.h
  head/sys/net/if_vlan.c
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_output.c
  head/sys/netinet6/ip6_output.c
  head/sys/sys/mbuf.h
  head/sys/sys/socket.h
  head/sys/sys/socketvar.h

Modified: head/lib/libc/sys/getsockopt.2
==============================================================================
--- head/lib/libc/sys/getsockopt.2      Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/lib/libc/sys/getsockopt.2      Wed Jan 18 13:31:17 2017        
(r312379)
@@ -28,7 +28,7 @@
 .\"     @(#)getsockopt.2       8.4 (Berkeley) 5/2/95
 .\" $FreeBSD$
 .\"
-.Dd April 5, 2013
+.Dd January 18, 2017
 .Dt GETSOCKOPT 2
 .Os
 .Sh NAME
@@ -188,6 +188,7 @@ The following options are recognized in
 .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get 
only)"
 .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket 
(uint32_t, set only)"
 .It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by 
SO_TIMESTAMP"
+.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second 
for the socket"
 .El
 .Pp
 .Dv SO_DEBUG
@@ -515,6 +516,10 @@ returns the maximal number of queued con
 returns the number of unaccepted complete connections.
 .Dv SO_LISTENINCQLEN
 returns the number of unaccepted incomplete connections.
+.Pp
+.Dv SO_MAX_PACING_RATE
+instructs the socket and underlying network adapter layers to limit the
+transfer rate to the given unsigned 32-bit value in bytes per second.
 .Sh RETURN VALUES
 .Rv -std
 .Sh ERRORS

Modified: head/sbin/ifconfig/ifconfig.8
==============================================================================
--- head/sbin/ifconfig/ifconfig.8       Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sbin/ifconfig/ifconfig.8       Wed Jan 18 13:31:17 2017        
(r312379)
@@ -28,7 +28,7 @@
 .\"     From: @(#)ifconfig.8   8.3 (Berkeley) 1/5/94
 .\" $FreeBSD$
 .\"
-.Dd September 17, 2016
+.Dd January 18, 2017
 .Dt IFCONFIG 8
 .Os
 .Sh NAME
@@ -460,6 +460,8 @@ this directive is used to select between
 and 802.11g
 .Pq Cm 11g
 operating modes.
+.It Cm txrtlmt
+Set if the driver supports TX rate limiting.
 .It Cm inst Ar minst , Cm instance Ar minst
 Set the media instance to
 .Ar minst .

Modified: head/sbin/ifconfig/ifconfig.c
==============================================================================
--- head/sbin/ifconfig/ifconfig.c       Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sbin/ifconfig/ifconfig.c       Wed Jan 18 13:31:17 2017        
(r312379)
@@ -1145,7 +1145,7 @@ unsetifdescr(const char *val, int value,
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
 
 /*
  * Print the status of the interface.  If an address family was
@@ -1453,6 +1453,8 @@ static struct cmd basic_cmds[] = {
        DEF_CMD("-wol_mcast",   -IFCAP_WOL_MCAST,       setifcap),
        DEF_CMD("wol_magic",    IFCAP_WOL_MAGIC,        setifcap),
        DEF_CMD("-wol_magic",   -IFCAP_WOL_MAGIC,       setifcap),
+       DEF_CMD("txrtlmt",      IFCAP_TXRTLMT,  setifcap),
+       DEF_CMD("-txrtlmt",     -IFCAP_TXRTLMT, setifcap),
        DEF_CMD("normal",       -IFF_LINK0,     setifflags),
        DEF_CMD("compress",     IFF_LINK0,      setifflags),
        DEF_CMD("noicmp",       IFF_LINK1,      setifflags),

Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/conf/NOTES Wed Jan 18 13:31:17 2017        (r312379)
@@ -619,6 +619,8 @@ options     HWPMC_HOOKS             # Other necessary 
 options        INET                    #Internet communications protocols
 options        INET6                   #IPv6 communications protocols
 
+options                RATELIMIT               # TX rate limiting support
+
 options        ROUTETABLES=2           # allocated fibs up to 65536. default 
is 1.
                                        # but that would be a bad idea as they 
are large.
 

Modified: head/sys/conf/config.mk
==============================================================================
--- head/sys/conf/config.mk     Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/conf/config.mk     Wed Jan 18 13:31:17 2017        (r312379)
@@ -19,6 +19,10 @@ opt_inet.h:
 opt_inet6.h:
        @echo "#define INET6 1" > ${.TARGET}
 .endif
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+       @echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
 .if ${MK_EISA} != "no"
 opt_eisa.h:
        @echo "#define DEV_EISA 1" > ${.TARGET}

Modified: head/sys/conf/kern.opts.mk
==============================================================================
--- head/sys/conf/kern.opts.mk  Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/conf/kern.opts.mk  Wed Jan 18 13:31:17 2017        (r312379)
@@ -48,6 +48,7 @@ __DEFAULT_NO_OPTIONS = \
     EXTRA_TCP_STACKS \
     NAND \
     OFED \
+    RATELIMIT \
     REPRODUCIBLE_BUILD
 
 # Some options are totally broken on some architectures. We disable

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options       Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/conf/options       Wed Jan 18 13:31:17 2017        (r312379)
@@ -412,6 +412,7 @@ BOOTP_NFSV3         opt_bootp.h
 BOOTP_WIRED_TO         opt_bootp.h
 DEVICE_POLLING
 DUMMYNET               opt_ipdn.h
+RATELIMIT              opt_ratelimit.h
 INET                   opt_inet.h
 INET6                  opt_inet6.h
 IPDIVERT

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/kern/uipc_socket.c Wed Jan 18 13:31:17 2017        (r312379)
@@ -2699,6 +2699,14 @@ sosetopt(struct socket *so, struct socko
                        so->so_ts_clock = optval;
                        break;
 
+               case SO_MAX_PACING_RATE:
+                       error = sooptcopyin(sopt, &val32, sizeof(val32),
+                           sizeof(val32));
+                       if (error)
+                               goto bad;
+                       so->so_max_pacing_rate = val32;
+                       break;
+
                default:
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,
@@ -2890,6 +2898,10 @@ integer:
                        optval = so->so_ts_clock;
                        goto integer;
 
+               case SO_MAX_PACING_RATE:
+                       optval = so->so_max_pacing_rate;
+                       goto integer;
+
                default:
                        if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
                                error = hhook_run_socket(so, sopt,

Modified: head/sys/modules/if_lagg/Makefile
==============================================================================
--- head/sys/modules/if_lagg/Makefile   Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sys/modules/if_lagg/Makefile   Wed Jan 18 13:31:17 2017        
(r312379)
@@ -2,6 +2,6 @@
 
 .PATH: ${.CURDIR}/../../net
 KMOD=  if_lagg
-SRCS=  if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h
+SRCS=  if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h
 
 .include <bsd.kmod.mk>

Modified: head/sys/modules/if_vlan/Makefile
==============================================================================
--- head/sys/modules/if_vlan/Makefile   Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sys/modules/if_vlan/Makefile   Wed Jan 18 13:31:17 2017        
(r312379)
@@ -4,6 +4,6 @@
 
 KMOD=  if_vlan
 SRCS=  if_vlan.c
-SRCS+= opt_inet.h opt_vlan.h
+SRCS+= opt_inet.h opt_vlan.h opt_ratelimit.h
 
 .include <bsd.kmod.mk>

Modified: head/sys/net/ieee8023ad_lacp.c
==============================================================================
--- head/sys/net/ieee8023ad_lacp.c      Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sys/net/ieee8023ad_lacp.c      Wed Jan 18 13:31:17 2017        
(r312379)
@@ -30,6 +30,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ratelimit.h"
+
 #include <sys/param.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
@@ -853,6 +855,35 @@ lacp_select_tx_port(struct lagg_softc *s
 
        return (lp->lp_lagg);
 }
+
+#ifdef RATELIMIT
+struct lagg_port *
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
+{
+       struct lacp_softc *lsc = LACP_SOFTC(sc);
+       struct lacp_portmap *pm;
+       struct lacp_port *lp;
+       uint32_t hash;
+
+       if (__predict_false(lsc->lsc_suppress_distributing)) {
+               LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
+               return (NULL);
+       }
+
+       pm = &lsc->lsc_pmap[lsc->lsc_activemap];
+       if (pm->pm_count == 0) {
+               LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
+               return (NULL);
+       }
+
+       hash = flowid >> sc->flowid_shift;
+       hash %= pm->pm_count;
+       lp = pm->pm_map[hash];
+
+       return (lp->lp_lagg);
+}
+#endif
+
 /*
  * lacp_suppress_distributing: drop transmit packets for a while
  * to preserve packet ordering.

Modified: head/sys/net/ieee8023ad_lacp.h
==============================================================================
--- head/sys/net/ieee8023ad_lacp.h      Wed Jan 18 13:27:24 2017        
(r312378)
+++ head/sys/net/ieee8023ad_lacp.h      Wed Jan 18 13:31:17 2017        
(r312379)
@@ -284,6 +284,9 @@ struct lacp_softc {
 
 struct mbuf    *lacp_input(struct lagg_port *, struct mbuf *);
 struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
+#ifdef RATELIMIT
+struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t);
+#endif
 void           lacp_attach(struct lagg_softc *);
 void           lacp_detach(void *);
 void           lacp_init(struct lagg_softc *);

Modified: head/sys/net/if.h
==============================================================================
--- head/sys/net/if.h   Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/net/if.h   Wed Jan 18 13:31:17 2017        (r312379)
@@ -239,6 +239,7 @@ struct if_data {
 #define        IFCAP_RXCSUM_IPV6       0x200000  /* can offload checksum on 
IPv6 RX */
 #define        IFCAP_TXCSUM_IPV6       0x400000  /* can offload checksum on 
IPv6 TX */
 #define        IFCAP_HWSTATS           0x800000 /* manages counters internally 
*/
+#define        IFCAP_TXRTLMT           0x1000000 /* hardware supports TX rate 
limiting */
 
 #define IFCAP_HWCSUM_IPV6      (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 

Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c      Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/net/if_dead.c      Wed Jan 18 13:31:17 2017        (r312379)
@@ -100,6 +100,30 @@ ifdead_get_counter(struct ifnet *ifp, if
        return (0);
 }
 
+static int
+ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+       return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params 
*params)
+{
+       return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params 
*params)
+{
+       return (EOPNOTSUPP);
+}
+
+static void
+ifdead_snd_tag_free(struct m_snd_tag *pmt)
+{
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -112,4 +136,8 @@ if_dead(struct ifnet *ifp)
        ifp->if_qflush = ifdead_qflush;
        ifp->if_transmit = ifdead_transmit;
        ifp->if_get_counter = ifdead_get_counter;
+       ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
+       ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
+       ifp->if_snd_tag_query = ifdead_snd_tag_query;
+       ifp->if_snd_tag_free = ifdead_snd_tag_free;
 }

Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c      Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/net/if_lagg.c      Wed Jan 18 13:31:17 2017        (r312379)
@@ -23,6 +23,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -118,6 +119,11 @@ static void        lagg_port2req(struct lagg_po
 static void    lagg_init(void *);
 static void    lagg_stop(struct lagg_softc *);
 static int     lagg_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef RATELIMIT
+static int     lagg_snd_tag_alloc(struct ifnet *,
+                   union if_snd_tag_alloc_params *,
+                   struct m_snd_tag **);
+#endif
 static int     lagg_ether_setmulti(struct lagg_softc *);
 static int     lagg_ether_cmdmulti(struct lagg_port *, int);
 static int     lagg_setflag(struct lagg_port *, int, int,
@@ -503,7 +509,12 @@ lagg_clone_create(struct if_clone *ifc, 
        ifp->if_ioctl = lagg_ioctl;
        ifp->if_get_counter = lagg_get_counter;
        ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
+#ifdef RATELIMIT
+       ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+       ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | 
IFCAP_TXRTLMT;
+#else
        ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
+#endif
 
        /*
         * Attach as an ordinary ethernet device, children will be attached
@@ -1549,6 +1560,52 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd
        return (error);
 }
 
+#ifdef RATELIMIT
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+       struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+       struct lagg_port *lp;
+       struct lagg_lb *lb;
+       uint32_t p;
+
+       switch (sc->sc_proto) {
+       case LAGG_PROTO_FAILOVER:
+               lp = lagg_link_active(sc, sc->sc_primary);
+               break;
+       case LAGG_PROTO_LOADBALANCE:
+               if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+                   params->hdr.flowtype == M_HASHTYPE_NONE)
+                       return (EOPNOTSUPP);
+               p = params->hdr.flowid >> sc->flowid_shift;
+               p %= sc->sc_count;
+               lb = (struct lagg_lb *)sc->sc_psc;
+               lp = lb->lb_ports[p];
+               lp = lagg_link_active(sc, lp);
+               break;
+       case LAGG_PROTO_LACP:
+               if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+                   params->hdr.flowtype == M_HASHTYPE_NONE)
+                       return (EOPNOTSUPP);
+               lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
+               break;
+       default:
+               return (EOPNOTSUPP);
+       }
+       if (lp == NULL)
+               return (EOPNOTSUPP);
+       ifp = lp->lp_ifp;
+       if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+           (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+               return (EOPNOTSUPP);
+
+       /* forward allocation request */
+       return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
+
 static int
 lagg_ether_setmulti(struct lagg_softc *sc)
 {

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h       Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/net/if_var.h       Wed Jan 18 13:31:17 2017        (r312379)
@@ -175,6 +175,49 @@ struct if_encap_req {
 
 #define        IFENCAP_FLAG_BROADCAST  0x02    /* Destination is broadcast */
 
+/*
+ * Network interface send tag support. The storage of "struct
+ * m_snd_tag" comes from the network driver and it is free to allocate
+ * as much additional space as it wants for its own use.
+ */
+struct m_snd_tag;
+
+#define        IF_SND_TAG_TYPE_RATE_LIMIT 0
+#define        IF_SND_TAG_TYPE_MAX 1
+
+struct if_snd_tag_alloc_header {
+       uint32_t type;          /* send tag type, see IF_SND_TAG_XXX */
+       uint32_t flowid;        /* mbuf hash value */
+       uint32_t flowtype;      /* mbuf hash type */
+};
+
+struct if_snd_tag_alloc_rate_limit {
+       struct if_snd_tag_alloc_header hdr;
+       uint64_t max_rate;      /* in bytes/s */
+};
+
+struct if_snd_tag_rate_limit_params {
+       uint64_t max_rate;      /* in bytes/s */
+};
+
+union if_snd_tag_alloc_params {
+       struct if_snd_tag_alloc_header hdr;
+       struct if_snd_tag_alloc_rate_limit rate_limit;
+};
+
+union if_snd_tag_modify_params {
+       struct if_snd_tag_rate_limit_params rate_limit;
+};
+
+union if_snd_tag_query_params {
+       struct if_snd_tag_rate_limit_params rate_limit;
+};
+
+typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params 
*,
+    struct m_snd_tag **);
+typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union 
if_snd_tag_modify_params *);
+typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union 
if_snd_tag_query_params *);
+typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
 
 /*
  * Structure defining a network interface.
@@ -304,12 +347,19 @@ struct ifnet {
        u_int   if_hw_tsomaxsegsize;    /* TSO maximum segment size in bytes */
 
        /*
+        * Network adapter send tag support:
+        */
+       if_snd_tag_alloc_t *if_snd_tag_alloc;
+       if_snd_tag_modify_t *if_snd_tag_modify;
+       if_snd_tag_query_t *if_snd_tag_query;
+       if_snd_tag_free_t *if_snd_tag_free;
+
+       /*
         * Spare fields to be added before branching a stable branch, so
         * that structure can be enhanced without changing the kernel
         * binary interface.
         */
-       void    *if_pspare[4];          /* packet pacing / general use */
-       int     if_ispare[4];           /* packet pacing / general use */
+       int     if_ispare[4];           /* general use */
 };
 
 /* for compatibility with other BSDs */

Modified: head/sys/net/if_vlan.c
==============================================================================
--- head/sys/net/if_vlan.c      Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/net/if_vlan.c      Wed Jan 18 13:31:17 2017        (r312379)
@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_vlan.h"
+#include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
@@ -212,6 +213,10 @@ static     void trunk_destroy(struct ifvlant
 static void vlan_init(void *foo);
 static void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
+#ifdef RATELIMIT
+static int vlan_snd_tag_alloc(struct ifnet *,
+    union if_snd_tag_alloc_params *, struct m_snd_tag **);
+#endif
 static void vlan_qflush(struct ifnet *ifp);
 static int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
@@ -971,6 +976,9 @@ vlan_clone_create(struct if_clone *ifc, 
        ifp->if_transmit = vlan_transmit;
        ifp->if_qflush = vlan_qflush;
        ifp->if_ioctl = vlan_ioctl;
+#ifdef RATELIMIT
+       ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+#endif
        ifp->if_flags = VLAN_IFFLAGS;
        ether_ifattach(ifp, eaddr);
        /* Now undo some of the damage... */
@@ -1591,6 +1599,15 @@ vlan_capabilities(struct ifvlan *ifv)
                TOEDEV(ifp) = TOEDEV(p);
                ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
        }
+
+#ifdef RATELIMIT
+       /*
+        * If the parent interface supports ratelimiting, so does the
+        * VLAN interface.
+        */
+       ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT);
+       ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT);
+#endif
 }
 
 static void
@@ -1801,3 +1818,19 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd
 
        return (error);
 }
+
+#ifdef RATELIMIT
+static int
+vlan_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+
+       /* get trunk device */
+       ifp = vlan_trunkdev(ifp);
+       if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+               return (EOPNOTSUPP);
+       /* forward allocation request */
+       return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c   Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/netinet/in_pcb.c   Wed Jan 18 13:31:17 2017        (r312379)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
@@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
@@ -1140,6 +1142,10 @@ in_pcbdetach(struct inpcb *inp)
 
        KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
+#ifdef RATELIMIT
+       if (inp->inp_snd_tag != NULL)
+               in_pcbdetach_txrtlmt(inp);
+#endif
        inp->inp_socket->so_pcb = NULL;
        inp->inp_socket = NULL;
 }
@@ -2677,3 +2683,253 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
        db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
+
+#ifdef RATELIMIT
+/*
+ * Modify TX rate limit based on the existing "inp->inp_snd_tag",
+ * if any.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
+{
+       union if_snd_tag_modify_params params = {
+               .rate_limit.max_rate = max_pacing_rate,
+       };
+       struct m_snd_tag *mst;
+       struct ifnet *ifp;
+       int error;
+
+       mst = inp->inp_snd_tag;
+       if (mst == NULL)
+               return (EINVAL);
+
+       ifp = mst->ifp;
+       if (ifp == NULL)
+               return (EINVAL);
+
+       if (ifp->if_snd_tag_modify == NULL) {
+               error = EOPNOTSUPP;
+       } else {
+               error = ifp->if_snd_tag_modify(mst, &params);
+       }
+       return (error);
+}
+
+/*
+ * Query existing TX rate limit based on the existing
+ * "inp->inp_snd_tag", if any.
+ */
+int
+in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
+{
+       union if_snd_tag_query_params params = { };
+       struct m_snd_tag *mst;
+       struct ifnet *ifp;
+       int error;
+
+       mst = inp->inp_snd_tag;
+       if (mst == NULL)
+               return (EINVAL);
+
+       ifp = mst->ifp;
+       if (ifp == NULL)
+               return (EINVAL);
+
+       if (ifp->if_snd_tag_query == NULL) {
+               error = EOPNOTSUPP;
+       } else {
+               error = ifp->if_snd_tag_query(mst, &params);
+               if (error == 0 &&  p_max_pacing_rate != NULL)
+                       *p_max_pacing_rate = params.rate_limit.max_rate;
+       }
+       return (error);
+}
+
+/*
+ * Allocate a new TX rate limit send tag from the network interface
+ * given by the "ifp" argument and save it in "inp->inp_snd_tag":
+ */
+int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+{
+       union if_snd_tag_alloc_params params = {
+               .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+               .rate_limit.hdr.flowid = flowid,
+               .rate_limit.hdr.flowtype = flowtype,
+               .rate_limit.max_rate = max_pacing_rate,
+       };
+       int error;
+
+       INP_WLOCK_ASSERT(inp);
+
+       if (inp->inp_snd_tag != NULL)
+               return (EINVAL);
+
+       if (ifp->if_snd_tag_alloc == NULL) {
+               error = EOPNOTSUPP;
+       } else {
+               error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+               /*
+                * At success increment the refcount on
+                * the send tag's network interface:
+                */
+               if (error == 0)
+                       if_ref(inp->inp_snd_tag->ifp);
+       }
+       return (error);
+}
+
+/*
+ * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
+ * if any:
+ */
+void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+       struct m_snd_tag *mst;
+       struct ifnet *ifp;
+
+       INP_WLOCK_ASSERT(inp);
+
+       mst = inp->inp_snd_tag;
+       inp->inp_snd_tag = NULL;
+
+       if (mst == NULL)
+               return;
+
+       ifp = mst->ifp;
+       if (ifp == NULL)
+               return;
+
+       /*
+        * If the device was detached while we still had reference(s)
+        * on the ifp, we assume if_snd_tag_free() was replaced with
+        * stubs.
+        */
+       ifp->if_snd_tag_free(mst);
+
+       /* release reference count on network interface */
+       if_rele(ifp);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+       struct socket *socket;
+       uint32_t max_pacing_rate;
+       bool did_upgrade;
+       int error;
+
+       if (inp == NULL)
+               return;
+
+       socket = inp->inp_socket;
+       if (socket == NULL)
+               return;
+
+       if (!INP_WLOCKED(inp)) {
+               /*
+                * NOTE: If the write locking fails, we need to bail
+                * out and use the non-ratelimited ring for the
+                * transmit until there is a new chance to get the
+                * write lock.
+                */
+               if (!INP_TRY_UPGRADE(inp))
+                       return;
+               did_upgrade = 1;
+       } else {
+               did_upgrade = 0;
+       }
+
+       /*
+        * NOTE: The so_max_pacing_rate value is read unlocked,
+        * because atomic updates are not required since the variable
+        * is checked at every mbuf we send. It is assumed that the
+        * variable read itself will be atomic.
+        */
+       max_pacing_rate = socket->so_max_pacing_rate;
+
+       /*
+        * NOTE: When attaching to a network interface a reference is
+        * made to ensure the network interface doesn't go away until
+        * all ratelimit connections are gone. The network interface
+        * pointers compared below represent valid network interfaces,
+        * except when comparing towards NULL.
+        */
+       if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
+               error = 0;
+       } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
+               if (inp->inp_snd_tag != NULL)
+                       in_pcbdetach_txrtlmt(inp);
+               error = 0;
+       } else if (inp->inp_snd_tag == NULL) {
+               /*
+                * In order to utilize packet pacing with RSS, we need
+                * to wait until there is a valid RSS hash before we
+                * can proceed:
+                */
+               if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
+                       error = EAGAIN;
+               } else {
+                       error = in_pcbattach_txrtlmt(inp, ifp, 
M_HASHTYPE_GET(mb),
+                           mb->m_pkthdr.flowid, max_pacing_rate);
+               }
+       } else {
+               error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+       }
+       if (error == 0 || error == EOPNOTSUPP)
+               inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+       if (did_upgrade)
+               INP_DOWNGRADE(inp);
+}
+
+/*
+ * Track route changes for TX rate limiting.
+ */
+void
+in_pcboutput_eagain(struct inpcb *inp)
+{
+       struct socket *socket;
+       bool did_upgrade;
+
+       if (inp == NULL)
+               return;
+
+       socket = inp->inp_socket;
+       if (socket == NULL)
+               return;
+
+       if (inp->inp_snd_tag == NULL)
+               return;
+
+       if (!INP_WLOCKED(inp)) {
+               /*
+                * NOTE: If the write locking fails, we need to bail
+                * out and use the non-ratelimited ring for the
+                * transmit until there is a new chance to get the
+                * write lock.
+                */
+               if (!INP_TRY_UPGRADE(inp))
+                       return;
+               did_upgrade = 1;
+       } else {
+               did_upgrade = 0;
+       }
+
+       /* detach rate limiting */
+       in_pcbdetach_txrtlmt(inp);
+
+       /* make sure new mbuf send tag allocation is made */
+       inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+
+       if (did_upgrade)
+               INP_DOWNGRADE(inp);
+}
+#endif /* RATELIMIT */

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h   Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/netinet/in_pcb.h   Wed Jan 18 13:31:17 2017        (r312379)
@@ -181,6 +181,7 @@ struct      icmp6_filter;
  * read-lock usage during modification, this model can be applied to other
  * protocols (especially SCTP).
  */
+struct m_snd_tag;
 struct inpcb {
        LIST_ENTRY(inpcb) inp_hash;     /* (h/i) hash list */
        LIST_ENTRY(inpcb) inp_pcbgrouphash;     /* (g/i) hash list */
@@ -202,11 +203,11 @@ struct inpcb {
        u_char  inp_ip_minttl;          /* (i) minimum TTL or drop */
        uint32_t inp_flowid;            /* (x) flow id / queue id */
        u_int   inp_refcount;           /* (i) refcount */
-       void    *inp_pspare[5];         /* (x) packet pacing / general use */
+       struct m_snd_tag *inp_snd_tag;  /* (i) send tag for outgoing mbufs */
+       void    *inp_pspare[4];         /* (x) general use */
        uint32_t inp_flowtype;          /* (x) M_HASHTYPE value */
        uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
-       u_int   inp_ispare[4];          /* (x) packet pacing / user cookie /
-                                        *     general use */
+       u_int   inp_ispare[4];          /* (x) user cookie / general use */
 
        /* Local and foreign ports, local and foreign addr. */
        struct  in_conninfo inp_inc;    /* (i) list for PCB's local port */
@@ -616,6 +617,7 @@ short       inp_so_options(const struct inpcb 
 #define        INP_RSS_BUCKET_SET      0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define        INP_RECVFLOWID          0x00000100 /* populate recv datagram with flow info */
 #define        INP_RECVRSSBUCKETID     0x00000200 /* populate recv datagram with bucket id */
+#define        INP_RATE_LIMIT_CHANGED  0x00000400 /* rate limit needs attention */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -736,6 +738,14 @@ int        in_getsockaddr(struct socket *so, st
 struct sockaddr *
        in_sockaddr(in_port_t port, struct in_addr *addr);
 void   in_pcbsosetlabel(struct socket *so);
+#ifdef RATELIMIT
+int    in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+void   in_pcbdetach_txrtlmt(struct inpcb *);
+int    in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
+int    in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
+void   in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+void   in_pcboutput_eagain(struct inpcb *);
+#endif
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c        Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/netinet/ip_output.c        Wed Jan 18 13:31:17 2017        (r312379)
@@ -33,6 +33,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
+#include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
@@ -661,8 +662,23 @@ sendit:
                 */
                m_clrprotoflags(m);
                IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+               if (inp != NULL) {
+                       if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+                               in_pcboutput_txrtlmt(inp, ifp, m);
+                       /* stamp send tag on mbuf */
+                       m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+               } else {
+                       m->m_pkthdr.snd_tag = NULL;
+               }
+#endif
                error = (*ifp->if_output)(ifp, m,
                    (const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+               /* check for route change */
+               if (error == EAGAIN)
+                       in_pcboutput_eagain(inp);
+#endif
                goto done;
        }
 
@@ -698,8 +714,23 @@ sendit:
 
                        IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
                            mtod(m, struct ip *), NULL);
+#ifdef RATELIMIT
+                       if (inp != NULL) {
+                               if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+                                       in_pcboutput_txrtlmt(inp, ifp, m);
+                               /* stamp send tag on mbuf */
+                               m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+                       } else {
+                               m->m_pkthdr.snd_tag = NULL;
+                       }
+#endif
                        error = (*ifp->if_output)(ifp, m,
                            (const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+                       /* check for route change */
+                       if (error == EAGAIN)
+                               in_pcboutput_eagain(inp);
+#endif
                } else
                        m_freem(m);
        }
@@ -974,6 +1005,16 @@ ip_ctloutput(struct socket *so, struct s
                                INP_WUNLOCK(inp);
                                error = 0;
                                break;
+                       case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+                               INP_WLOCK(inp);
+                               inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+                               INP_WUNLOCK(inp);
+                               error = 0;
+#else
+                               error = EOPNOTSUPP;
+#endif
+                               break;
                        default:
                                break;
                        }

Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c      Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/netinet6/ip6_output.c      Wed Jan 18 13:31:17 2017        (r312379)
@@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
@@ -954,8 +955,23 @@ passout:
                            m->m_pkthdr.len);
                        ifa_free(&ia6->ia_ifa);
                }
+#ifdef RATELIMIT
+               if (inp != NULL) {
+                       if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+                               in_pcboutput_txrtlmt(inp, ifp, m);
+                       /* stamp send tag on mbuf */
+                       m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+               } else {
+                       m->m_pkthdr.snd_tag = NULL;
+               }
+#endif
                error = nd6_output_ifp(ifp, origifp, m, dst,
                    (struct route *)ro);
+#ifdef RATELIMIT
+               /* check for route change */
+               if (error == EAGAIN)
+                       in_pcboutput_eagain(inp);
+#endif
                goto done;
        }
 
@@ -1054,8 +1070,23 @@ sendorfree:
                                counter_u64_add(ia->ia_ifa.ifa_obytes,
                                    m->m_pkthdr.len);
                        }
+#ifdef RATELIMIT
+                       if (inp != NULL) {
+                               if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+                                       in_pcboutput_txrtlmt(inp, ifp, m);
+                               /* stamp send tag on mbuf */
+                               m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+                       } else {
+                               m->m_pkthdr.snd_tag = NULL;
+                       }
+#endif
                        error = nd6_output_ifp(ifp, origifp, m, dst,
                            (struct route *)ro);
+#ifdef RATELIMIT
+                       /* check for route change */
+                       if (error == EAGAIN)
+                               in_pcboutput_eagain(inp);
+#endif
                } else
                        m_freem(m);
        }
@@ -1441,6 +1472,16 @@ ip6_ctloutput(struct socket *so, struct 
                                INP_WUNLOCK(in6p);
                                error = 0;
                                break;
+                       case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+                               INP_WLOCK(in6p);
+                               in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+                               INP_WUNLOCK(in6p);
+                               error = 0;
+#else
+                               error = EOPNOTSUPP;
+#endif
+                               break;
                        default:
                                break;
                        }

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h Wed Jan 18 13:27:24 2017        (r312378)
+++ head/sys/sys/mbuf.h Wed Jan 18 13:31:17 2017        (r312379)
@@ -130,6 +130,14 @@ struct m_tag {
 };
 
 /*
+ * Static network interface owned tag.
+ * Allocated through ifp->if_snd_tag_alloc().
+ */
+struct m_snd_tag {
+       struct ifnet *ifp;              /* network interface tag belongs to */
+};
+
+/*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *      LP64: 56

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to