Author: trasz
Date: Mon Dec  2 20:58:04 2019
New Revision: 355304
URL: https://svnweb.freebsd.org/changeset/base/355304

Log:
  Make use of the stats(3) framework in the TCP stack.
  
  This makes it possible to retrieve per-connection statistical
  information such as the receive window size, RTT, or goodput,
  using a newly added TCP_STATS getsockopt(2) option, and extract
  them using the stats_voistat_fetch(3) API.
  
  See the net/tcprtt port for an example consumer of this API.
  
  Compared to the existing TCP_INFO system, the main differences
  are that this mechanism is easy to extend without breaking ABI,
  and provides statistical information instead of raw "snapshots"
  of values at a given point in time.  stats(3) is more generic
  and can be used in both userland and the kernel.
  
  Reviewed by:  thj
  Tested by:    thj
  Obtained from:        Netflix
  Relnotes:     yes
  Sponsored by: Klara Inc, Netflix
  Differential Revision:        https://reviews.freebsd.org/D20655

Added:
  head/sys/netinet/tcp_stats.c   (contents, props changed)
Modified:
  head/lib/libstats/Makefile
  head/share/man/man4/tcp.4
  head/sys/conf/files
  head/sys/netinet/cc/cc.h
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_log_buf.c
  head/sys/netinet/tcp_output.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/stats.h

Modified: head/lib/libstats/Makefile
==============================================================================
--- head/lib/libstats/Makefile  Mon Dec  2 20:57:13 2019        (r355303)
+++ head/lib/libstats/Makefile  Mon Dec  2 20:58:04 2019        (r355304)
@@ -3,12 +3,12 @@
 LIB=           stats
 SHLIBDIR?=     /lib
 SHLIB_MAJOR=   0
-SRCS=          subr_stats.c
+SRCS=          subr_stats.c tcp_stats.c
 
 # To debug, comment WITHOUT_ASSERT_DEBUG= and uncomment CFLAGS:=
 WITHOUT_ASSERT_DEBUG=
 #CFLAGS:=${CFLAGS:C/-O[0-9]/-O0 -g3/} -DDIAGNOSTIC
 
-.PATH: ${.CURDIR}/../../sys/kern
+.PATH: ${.CURDIR}/../../sys/kern ${.CURDIR}/../../sys/netinet
 
 .include <bsd.lib.mk>

Modified: head/share/man/man4/tcp.4
==============================================================================
--- head/share/man/man4/tcp.4   Mon Dec  2 20:57:13 2019        (r355303)
+++ head/share/man/man4/tcp.4   Mon Dec  2 20:58:04 2019        (r355304)
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4        8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd December 1, 2019
+.Dd December 2, 2019
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -291,6 +291,10 @@ This entry can only be specified on a per-host basis a
 .Pp
 If an SADB entry cannot be found for the destination,
 the system does not send any outgoing segments and drops any inbound segments.
+.It Dv TCP_STATS
+Manage collection of connection level statistics using the
+.Xr stats 3
+framework.
 .Pp
 Each dropped segment is taken into account in the TCP protocol statistics.
 .It Dv TCP_TXTLS_ENABLE
@@ -664,6 +668,17 @@ Default is false.
 When initializing the TCP timestamps, use a per connection offset instead of a
 per host pair offset.
 Default is to use per connection offsets as recommended in RFC 7323.
+.It Va perconn_stats_enable
+Controls the default collection of statistics for all connections using the
+.Xr stats 3
+framework.
+0 disables, 1 enables, 2 enables random sampling across log id connection
+groups with all connections in a group receiving the same setting.
+.It Va perconn_stats_sample_rates
+A CSV list of template_spec=percent key-value pairs which controls the per
+template sampling rates when
+.Xr stats 3
+sampling is enabled.
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
@@ -703,6 +718,7 @@ when trying to use a TCP function block that is not av
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
 .Xr socket 2 ,
+.Xr stats 3 ,
 .Xr sysctl 3 ,
 .Xr blackhole 4 ,
 .Xr inet 4 ,

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/conf/files Mon Dec  2 20:58:04 2019        (r355304)
@@ -4295,6 +4295,7 @@ netinet/tcp_pcap.c                optional inet tcppcap | inet6 tcpp
        compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c            optional inet | inet6
 netinet/tcp_sack.c             optional inet | inet6
+netinet/tcp_stats.c            optional stats inet | stats inet6
 netinet/tcp_subr.c             optional inet | inet6
 netinet/tcp_syncache.c         optional inet | inet6
 netinet/tcp_timer.c            optional inet | inet6

Modified: head/sys/netinet/cc/cc.h
==============================================================================
--- head/sys/netinet/cc/cc.h    Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/cc/cc.h    Mon Dec  2 20:58:04 2019        (r355304)
@@ -51,9 +51,7 @@
 #ifndef _NETINET_CC_CC_H_
 #define _NETINET_CC_CC_H_
 
-#if !defined(_KERNEL)
-#error "no user-serviceable parts inside"
-#endif
+#ifdef _KERNEL
 
 /* Global CC vars. */
 extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
@@ -108,6 +106,7 @@ struct cc_var {
 #define        CC_DUPACK       0x0002  /* Duplicate ACK. */
 #define        CC_PARTIALACK   0x0004  /* Not yet. */
 #define        CC_SACK         0x0008  /* Not yet. */
+#endif /* _KERNEL */
 
 /*
  * Congestion signal types passed to the cong_signal() hook. The highest order 8
@@ -121,6 +120,7 @@ struct cc_var {
 
 #define        CC_SIGPRIVMASK  0xFF000000      /* Mask to check if sig is private. */
 
+#ifdef _KERNEL
 /*
  * Structure to hold data and function pointers that together represent a
  * congestion control algorithm.
@@ -184,4 +184,5 @@ extern struct rwlock cc_list_lock;
 
 #define CC_ALGOOPT_LIMIT       2048
 
+#endif /* _KERNEL */
 #endif /* _NETINET_CC_CC_H_ */

Modified: head/sys/netinet/tcp.h
==============================================================================
--- head/sys/netinet/tcp.h      Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp.h      Mon Dec  2 20:58:04 2019        (r355304)
@@ -168,6 +168,7 @@ struct tcphdr {
 #define TCP_NOOPT      8       /* don't use TCP options */
 #define TCP_MD5SIG     16      /* use MD5 digests (RFC2385) */
 #define        TCP_INFO        32      /* retrieve tcp_info structure */
+#define        TCP_STATS       33      /* retrieve stats blob structure */
 #define        TCP_LOG         34      /* configure event logging for connection */
 #define        TCP_LOGBUF      35      /* retrieve event log for connection */
 #define        TCP_LOGID       36      /* configure log ID to correlate connections */
@@ -363,5 +364,19 @@ struct tcp_function_set {
  * TCP Control message types
  */
 #define        TLS_SET_RECORD_TYPE     1
+
+/*
+ * TCP specific variables of interest for tp->t_stats stats(9) accounting.
+ */
+#define        VOI_TCP_TXPB            0 /* Transmit payload bytes */
+#define        VOI_TCP_RETXPB          1 /* Retransmit payload bytes */
+#define        VOI_TCP_FRWIN           2 /* Foreign receive window */
+#define        VOI_TCP_LCWIN           3 /* Local congestion window */
+#define        VOI_TCP_RTT             4 /* Round trip time */
+#define        VOI_TCP_CSIG            5 /* Congestion signal */
+#define        VOI_TCP_GPUT            6 /* Goodput */
+#define        VOI_TCP_CALCFRWINDIFF   7 /* Congestion avoidance LCWIN - FRWIN */
+#define        VOI_TCP_GPUT_ND         8 /* Goodput normalised delta */
+#define        VOI_TCP_ACKLEN          9 /* Average ACKed bytes per ACK */
 
 #endif /* !_NETINET_TCP_H_ */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c        Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_input.c        Mon Dec  2 20:58:04 2019        (r355304)
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
+#include <sys/arb.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
@@ -66,6 +67,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/proc.h>          /* for proc0 declaration */
 #include <sys/protosw.h>
+#include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
@@ -73,6 +75,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/stats.h>
 
 #include <machine/cpu.h>       /* before tcp_seq.h, for tcp_random18() */
 
@@ -298,6 +301,10 @@ void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
     uint16_t type)
 {
+#ifdef STATS
+       int32_t gput;
+#endif
+
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
        tp->ccv->nsegs = nsegs;
@@ -310,6 +317,35 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, u
                tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
        if (type == CC_ACK) {
+#ifdef STATS
+               stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
+                   ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
+               if (!IN_RECOVERY(tp->t_flags))
+                       stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
+                          tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs));
+               if ((tp->t_flags & TF_GPUTINPROG) &&
+                   SEQ_GEQ(th->th_ack, tp->gput_ack)) {
+                       /*
+                        * Compute goodput in bits per millisecond.
+                        */
+                       gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) /
+                           max(1, tcp_ts_getticks() - tp->gput_ts);
+                       stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+                           gput);
+                       /*
+                        * XXXLAS: This is a temporary hack, and should be
+                        * chained off VOI_TCP_GPUT when stats(9) grows an API
+                        * to deal with chained VOIs.
+                        */
+                       if (tp->t_stats_gput_prev > 0)
+                               stats_voi_update_abs_s32(tp->t_stats,
+                                   VOI_TCP_GPUT_ND,
+                                   ((gput - tp->t_stats_gput_prev) * 100) /
+                                   tp->t_stats_gput_prev);
+                       tp->t_flags &= ~TF_GPUTINPROG;
+                       tp->t_stats_gput_prev = gput;
+               }
+#endif /* STATS */
                if (tp->snd_cwnd > tp->snd_ssthresh) {
                        tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
                             nsegs * V_tcp_abc_l_var * tcp_maxseg(tp));
@@ -328,6 +364,9 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, u
                tp->ccv->curack = th->th_ack;
                CC_ALGO(tp)->ack_received(tp->ccv, type);
        }
+#ifdef STATS
+       stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
+#endif
 }
 
 void 
@@ -393,6 +432,10 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, ui
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
+#ifdef STATS
+       stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
+#endif
+
        switch(type) {
        case CC_NDUPACK:
                if (!IN_FASTRECOVERY(tp->t_flags)) {
@@ -1496,6 +1539,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
         * For the SYN_SENT state the scale is zero.
         */
        tiwin = th->th_win << tp->snd_scale;
+#ifdef STATS
+       stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
 
        /*
         * TCP ECN processing.
@@ -3359,6 +3405,10 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
 
        TCPSTAT_INC(tcps_rttupdated);
        tp->t_rttupdated++;
+#ifdef STATS
+       stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
+           imax(0, rtt * 1000 / hz));
+#endif
        if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
                /*
                 * srtt is stored as fixed point with 5 bits after the

Modified: head/sys/netinet/tcp_log_buf.c
==============================================================================
--- head/sys/netinet/tcp_log_buf.c      Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_log_buf.c      Mon Dec  2 20:58:04 2019        (r355304)
@@ -30,10 +30,12 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/arb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/qmath.h>
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
@@ -41,6 +43,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/tree.h>
+#include <sys/stats.h>
 #include <sys/counter.h>
 
 #include <dev/tcp_log/tcp_log_dev.h>
@@ -475,7 +478,7 @@ tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
 
        INP_WLOCK_ASSERT(tp->t_inpcb);
 
-#ifdef NETFLIX
+#ifdef STATS
        if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
                (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
 #endif

Modified: head/sys/netinet/tcp_output.c
==============================================================================
--- head/sys/netinet/tcp_output.c       Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_output.c       Mon Dec  2 20:58:04 2019        (r355304)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/domain.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
@@ -54,10 +55,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
+#include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/stats.h>
 
 #include <net/if.h>
 #include <net/route.h>
@@ -991,15 +994,31 @@ send:
                struct sockbuf *msb;
                u_int moff;
 
-               if ((tp->t_flags & TF_FORCEDATA) && len == 1)
+               if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
                        TCPSTAT_INC(tcps_sndprobe);
-               else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+#ifdef STATS
+                       if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+                               stats_voi_update_abs_u32(tp->t_stats,
+                               VOI_TCP_RETXPB, len);
+                       else
+                               stats_voi_update_abs_u64(tp->t_stats,
+                                   VOI_TCP_TXPB, len);
+#endif /* STATS */
+               } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
                        tp->t_sndrexmitpack++;
                        TCPSTAT_INC(tcps_sndrexmitpack);
                        TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+#ifdef STATS
+                       stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+                           len);
+#endif /* STATS */
                } else {
                        TCPSTAT_INC(tcps_sndpack);
                        TCPSTAT_ADD(tcps_sndbyte, len);
+#ifdef STATS
+                       stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+                           len);
+#endif /* STATS */
                }
 #ifdef INET6
                if (MHLEN < hdrlen + max_linkhdr)
@@ -1472,6 +1491,15 @@ out:
                                tp->t_rtseq = startseq;
                                TCPSTAT_INC(tcps_segstimed);
                        }
+#ifdef STATS
+                       if (!(tp->t_flags & TF_GPUTINPROG) && len) {
+                               tp->t_flags |= TF_GPUTINPROG;
+                               tp->gput_seq = startseq;
+                               tp->gput_ack = startseq +
+                                   ulmin(sbavail(&so->so_snd) - off, sendwin);
+                               tp->gput_ts = tcp_ts_getticks();
+                       }
+#endif /* STATS */
                }
 
                /*

Added: head/sys/netinet/tcp_stats.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/netinet/tcp_stats.c        Mon Dec  2 20:58:04 2019        (r355304)
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2016-2018 Netflix, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Author: Lawrence Stewart <[email protected]>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/arb.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/qmath.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#ifdef _KERNEL
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/systm.h>
+#endif
+#include <sys/stats.h>
+
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc.h>
+
+VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1;
+
+#ifndef _KERNEL
+#define        V_tcp_perconn_stats_enable      VNET(tcp_perconn_stats_enable)
+#define        V_tcp_perconn_stats_dflt_tpl    VNET(tcp_perconn_stats_dflt_tpl)
+#else /* _KERNEL */
+
+VNET_DEFINE(int, tcp_perconn_stats_enable) = 2;
+VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates);
+VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0;
+#define        V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates)
+#define        V_tcp_stats_nrates              VNET(tcp_stats_nrates)
+
+static struct rmlock tcp_stats_tpl_sampling_lock;
+static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx);
+
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_perconn_stats_enable), 0,
+    "Enable per-connection TCP stats gathering; 1 enables for all connections, "
+    "2 enables random sampling across log id connection groups");
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates,
+    CTLTYPE_STRING | CTLFLAG_RW, tcp_stats_tpl_sr_cb,
+    sizeof(struct rm_priotracker), stats_tpl_sample_rates, "A",
+    "TCP stats per template random sampling rates, in CSV tpl_spec=percent "
+    "key-value pairs (see stats(9) for template spec details)");
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+int
+#else
+static int
+/* Ensure all templates are also added to the userland template list. */
+__attribute__ ((constructor))
+#endif
+tcp_stats_init()
+{
+       int err, lasterr;
+
+       err = lasterr = 0;
+
+       V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0);
+       if (V_tcp_perconn_stats_dflt_tpl < 0)
+               return (-V_tcp_perconn_stats_dflt_tpl);
+
+       struct voistatspec vss_sum[] = {
+               STATS_VSS_SUM(),
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64,
+           NVSS(vss_sum), vss_sum, 0);
+       lasterr = err ? err : lasterr;
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32,
+           NVSS(vss_sum), vss_sum, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_max[] = {
+               STATS_VSS_MAX(),
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG,
+           NVSS(vss_max), vss_max, 0);
+       lasterr = err ? err : lasterr;
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG,
+           NVSS(vss_max), vss_max, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_rtt[] = {
+               STATS_VSS_MAX(),
+               STATS_VSS_MIN(),
+               STATS_VSS_TDGSTCLUST32(20, 4),
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32,
+           NVSS(vss_rtt), vss_rtt, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_congsig[] = {
+               STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO),
+                   DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0)
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32,
+           NVSS(vss_congsig), vss_congsig, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_gput[] = {
+               STATS_VSS_MAX(),
+               STATS_VSS_TDGSTCLUST32(20, 4),
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32,
+           NVSS(vss_gput), vss_gput, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_gput_nd[] = {
+               STATS_VSS_TDGSTCLUST32(10, 4),
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32,
+           NVSS(vss_gput_nd), vss_gput_nd, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_windiff[] = {
+               STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF)
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32,
+           NVSS(vss_windiff), vss_windiff, 0);
+       lasterr = err ? err : lasterr;
+
+       struct voistatspec vss_acklen[] = {
+               STATS_VSS_MAX(),
+               STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF)
+       };
+       err |= stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+           VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32,
+           NVSS(vss_acklen), vss_acklen, 0);
+       lasterr = err ? err : lasterr;
+
+       return (lasterr);
+}
+
+#ifdef _KERNEL
+int
+tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+    size_t seed_len)
+{
+       struct rm_priotracker tracker;
+       int tpl;
+
+       tpl = -1;
+
+       if (V_tcp_stats_nrates > 0) {
+               rm_rlock(&tcp_stats_tpl_sampling_lock, &tracker);
+               tpl = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates,
+                   V_tcp_stats_nrates, seed_bytes, seed_len);
+               rm_runlock(&tcp_stats_tpl_sampling_lock, &tracker);
+
+               if (tpl >= 0) {
+                       INP_WLOCK_ASSERT(tp->t_inpcb);
+                       if (tp->t_stats != NULL)
+                               stats_blob_destroy(tp->t_stats);
+                       tp->t_stats = stats_blob_alloc(tpl, 0);
+                       if (tp->t_stats == NULL)
+                               tpl = -ENOMEM;
+               }
+       }
+
+       return (tpl);
+}
+
+/*
+ * Callback function for stats_tpl_sample_rates() to interact with the TCP
+ * subsystem's stats template sample rates list.
+ */
+int
+tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx)
+{
+       struct stats_tpl_sample_rate *old_rates;
+       int old_nrates;
+
+       if (ctx == NULL)
+               return (ENOMEM);
+
+       switch (action) {
+       case TPL_SR_RLOCKED_GET:
+               /*
+                * Return with rlock held i.e. this call must be paired with a
+                * "action == TPL_SR_RUNLOCK" call.
+                */
+               rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+               rm_rlock(&tcp_stats_tpl_sampling_lock,
+                   (struct rm_priotracker *)ctx);
+               /* FALLTHROUGH */
+       case TPL_SR_UNLOCKED_GET:
+               if (rates != NULL)
+                       *rates = V_tcp_perconn_stats_sample_rates;
+               if (nrates != NULL)
+                       *nrates = V_tcp_stats_nrates;
+               break;
+       case TPL_SR_RUNLOCK:
+               rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED);
+               rm_runlock(&tcp_stats_tpl_sampling_lock,
+                   (struct rm_priotracker *)ctx);
+               break;
+       case TPL_SR_PUT:
+               KASSERT(rates != NULL && nrates != NULL,
+                   ("%s: PUT without new rates", __func__));
+               rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+               if (rates == NULL || nrates == NULL)
+                       return (EINVAL);
+               rm_wlock(&tcp_stats_tpl_sampling_lock);
+               old_rates = V_tcp_perconn_stats_sample_rates;
+               old_nrates = V_tcp_stats_nrates;
+               V_tcp_perconn_stats_sample_rates = *rates;
+               V_tcp_stats_nrates = *nrates;
+               rm_wunlock(&tcp_stats_tpl_sampling_lock);
+               *rates = old_rates;
+               *nrates = old_nrates;
+               break;
+       default:
+               return (EINVAL);
+               break;
+       }
+
+       return (0);
+}
+
+RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock,
+    "tcp_stats_tpl_sampling_lock");
+#endif /* _KERNEL */

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_subr.c Mon Dec  2 20:58:04 2019        (r355304)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #ifdef TCP_HHOOK
@@ -54,6 +55,8 @@ __FBSDID("$FreeBSD$");
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
+#include <sys/qmath.h>
+#include <sys/stats.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
@@ -1005,6 +1008,11 @@ tcp_init(void)
            &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
                printf("%s: WARNING: unable to register helper hook\n", __func__);
 #endif
+#ifdef STATS
+       if (tcp_stats_init())
+               printf("%s: WARNING: unable to initialise TCP stats\n",
+                   __func__);
+#endif
        hashsize = TCBHASHSIZE;
        TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
        if (hashsize == 0) {
@@ -1694,6 +1702,10 @@ tcp_newtcpcb(struct inpcb *inp)
        if (tp->t_fb->tfb_tcp_fb_init) {
                (*tp->t_fb->tfb_tcp_fb_init)(tp);
        }
+#ifdef STATS
+       if (V_tcp_perconn_stats_enable == 1)
+               tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
+#endif
        return (tp);            /* XXX */
 }
 
@@ -1911,6 +1923,9 @@ tcp_discardcb(struct tcpcb *tp)
 
 #ifdef TCP_HHOOK
        khelp_destroy_osd(tp->osd);
+#endif
+#ifdef STATS
+       stats_blob_destroy(tp->t_stats);
 #endif
 
        CC_ALGO(tp) = NULL;

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c       Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_usrreq.c       Mon Dec  2 20:58:04 2019        (r355304)
@@ -49,11 +49,13 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
+#include <sys/qmath.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #ifdef INET6
@@ -65,6 +67,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/syslog.h>
+#include <sys/stats.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -108,6 +111,13 @@ __FBSDID("$FreeBSD$");
 #endif
 #include <netipsec/ipsec_support.h>
 
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
 /*
  * TCP protocol interface to socket abstraction.
  */
@@ -1816,6 +1826,9 @@ tcp_default_ctloutput(struct socket *so, struct sockop
 #endif
        struct cc_algo *algo;
        char    *pbuf, buf[TCP_LOG_ID_LEN];
+#ifdef STATS
+       struct statsblob *sbp;
+#endif
        size_t  len;
 
        /*
@@ -1933,6 +1946,35 @@ unlock_and_done:
                        error = EINVAL;
                        break;
 
+               case TCP_STATS:
+                       INP_WUNLOCK(inp);
+#ifdef STATS
+                       error = sooptcopyin(sopt, &optval, sizeof optval,
+                           sizeof optval);
+                       if (error)
+                               return (error);
+
+                       if (optval > 0)
+                               sbp = stats_blob_alloc(
+                                   V_tcp_perconn_stats_dflt_tpl, 0);
+                       else
+                               sbp = NULL;
+
+                       INP_WLOCK_RECHECK(inp);
+                       if ((tp->t_stats != NULL && sbp == NULL) ||
+                           (tp->t_stats == NULL && sbp != NULL)) {
+                               struct statsblob *t = tp->t_stats;
+                               tp->t_stats = sbp;
+                               sbp = t;
+                       }
+                       INP_WUNLOCK(inp);
+
+                       stats_blob_destroy(sbp);
+#else
+                       return (EOPNOTSUPP);
+#endif /* !STATS */
+                       break;
+
                case TCP_CONGESTION:
                        INP_WUNLOCK(inp);
                        error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
@@ -2217,6 +2259,55 @@ unlock_and_done:
                        INP_WUNLOCK(inp);
                        error = sooptcopyout(sopt, &ti, sizeof ti);
                        break;
+               case TCP_STATS:
+                       {
+#ifdef STATS
+                       int nheld;
+                       TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
+
+                       error = 0;
+                       socklen_t outsbsz = sopt->sopt_valsize;
+                       if (tp->t_stats == NULL)
+                               error = ENOENT;
+                       else if (outsbsz >= tp->t_stats->cursz)
+                               outsbsz = tp->t_stats->cursz;
+                       else if (outsbsz >= sizeof(struct statsblob))
+                               outsbsz = sizeof(struct statsblob);
+                       else
+                               error = EINVAL;
+                       INP_WUNLOCK(inp);
+                       if (error)
+                               break;
+
+                       sbp = sopt->sopt_val;
+                       nheld = atop(round_page(((vm_offset_t)sbp) +
+                           (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
+                       vm_page_t ma[nheld];
+                       if (vm_fault_quick_hold_pages(
+                           &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
+                           outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
+                           nheld) < 0) {
+                               error = EFAULT;
+                               break;
+                       }
+
+                       if ((error = copyin_nofault(&(sbp->flags), &sbflags,
+                           SIZEOF_MEMBER(struct statsblob, flags))))
+                               goto unhold;
+
+                       INP_WLOCK_RECHECK(inp);
+                       error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
+                           sbflags | SB_CLONE_USRDSTNOFAULT);
+                       INP_WUNLOCK(inp);
+                       sopt->sopt_valsize = outsbsz;
+unhold:
+                       vm_page_unhold_pages(ma, nheld);
+#else
+                       INP_WUNLOCK(inp);
+                       error = EOPNOTSUPP;
+#endif /* !STATS */
+                       break;
+                       }
                case TCP_CONGESTION:
                        len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
                        INP_WUNLOCK(inp);

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h  Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/netinet/tcp_var.h  Mon Dec  2 20:58:04 2019        (r355304)
@@ -210,7 +210,12 @@ struct tcpcb {
        struct tcp_log_id_node *t_lin;
        struct tcp_log_id_bucket *t_lib;
        const char *t_output_caller;    /* Function that called tcp_output */
+       struct statsblob *t_stats;      /* Per-connection stats */
        uint32_t t_logsn;               /* Log "serial number" */
+       uint32_t gput_ts;               /* Time goodput measurement started */
+       tcp_seq gput_seq;               /* Outbound measurement seq */
+       tcp_seq gput_ack;               /* Inbound measurement ack */
+       int32_t t_stats_gput_prev;      /* XXXLAS: Prev gput measurement */
        uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
        unsigned int *t_tfo_pending;    /* TCP Fast Open server pending counter */
        union {
@@ -327,7 +332,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
 #define        TF_NOPUSH       0x00001000      /* don't push */
 #define        TF_PREVVALID    0x00002000      /* saved values for bad rxmit valid */
 #define        TF_UNUSED1      0x00004000      /* unused */
-#define        TF_UNUSED2      0x00008000      /* unused */
+#define        TF_GPUTINPROG   0x00008000      /* Goodput measurement in progress */
 #define        TF_MORETOCOME   0x00010000      /* More data to be appended to sock */
 #define        TF_LQ_OVERFLOW  0x00020000      /* listen queue overflow */
 #define        TF_LASTIDLE     0x00040000      /* connection was previously idle */
@@ -787,6 +792,10 @@ VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_mssdflt);
+#ifdef STATS
+VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
+VNET_DECLARE(int, tcp_perconn_stats_enable);
+#endif /* STATS */
 VNET_DECLARE(int, tcp_recvspace);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
@@ -823,6 +832,10 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
 #define        V_tcp_insecure_syn              VNET(tcp_insecure_syn)
 #define        V_tcp_minmss                    VNET(tcp_minmss)
 #define        V_tcp_mssdflt                   VNET(tcp_mssdflt)
+#ifdef STATS
+#define        V_tcp_perconn_stats_dflt_tpl    VNET(tcp_perconn_stats_dflt_tpl)
+#define        V_tcp_perconn_stats_enable      VNET(tcp_perconn_stats_enable)
+#endif /* STATS */
 #define        V_tcp_recvspace                 VNET(tcp_recvspace)
 #define        V_tcp_sack_globalholes          VNET(tcp_sack_globalholes)
 #define        V_tcp_sack_globalmaxholes       VNET(tcp_sack_globalmaxholes)
@@ -966,10 +979,13 @@ int        tcp_newreno(struct tcpcb *, struct tcphdr *);
 int     tcp_compute_pipe(struct tcpcb *);
 uint32_t tcp_compute_initwnd(uint32_t);
 void    tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
+int     tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+    size_t seed_len);
 struct mbuf *
         tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
           int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
 
+int    tcp_stats_init(void);
 
 static inline void
 tcp_fields_to_host(struct tcphdr *th)

Modified: head/sys/sys/stats.h
==============================================================================
--- head/sys/sys/stats.h        Mon Dec  2 20:57:13 2019        (r355303)
+++ head/sys/sys/stats.h        Mon Dec  2 20:58:04 2019        (r355304)
@@ -58,6 +58,9 @@
 #define _SYS_STATS_H_
 
 #include <sys/limits.h>
+#ifdef DIAGNOSTIC
+#include <sys/tree.h>
+#endif
 
 #ifndef _KERNEL
 /*
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to