> On 31 Oct 2014, at 22:37, Alexander Bluhm <[email protected]> wrote:
>
> On Fri, Oct 31, 2014 at 02:50:00PM +1000, David Gwynne wrote:
>> so without splicing, the payloads from multiple tcp packets (at least all of
>> the ones in a single softnet run?) get bundled up into a buffer that
>> userland reads and then writes out again in a single go. right?
>>
>> you're suggesting the taskq as a way to defer output till after the current
>> softnet call has processed all its packets and queued all the tcp packet
>> payloads onto the socket?
>
> Exactly.
>
>> it's worth remembering there are other memory costs too. i think a kthread
>> (the thing taskqs run on) is 5 pages on amd64, so 20KB.
>
> We can delay the creation of the sosplice thread until user-land
> tries to splice for the first time.
>
> I would like to get the sosplice pool in first. The user-land
> part was missing from my previous diff. Here is an updated diff that
> actually passes make build.
>
> ok?
the pool change seems fine.
>
> bluhm
>
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.267
> diff -u -p -u -p -r1.267 kern_sysctl.c
> --- sys/kern/kern_sysctl.c 17 Oct 2014 01:51:39 -0000 1.267
> +++ sys/kern/kern_sysctl.c 31 Oct 2014 10:23:44 -0000
> @@ -1062,11 +1062,12 @@ fill_file(struct kinfo_file *kf, struct
> kf->so_family = so->so_proto->pr_domain->dom_family;
> kf->so_rcv_cc = so->so_rcv.sb_cc;
> kf->so_snd_cc = so->so_snd.sb_cc;
> - if (so->so_splice) {
> + if (isspliced(so)) {
> if (show_pointers)
> - kf->so_splice = PTRTOINT64(so->so_splice);
> - kf->so_splicelen = so->so_splicelen;
> - } else if (so->so_spliceback)
> + kf->so_splice =
> + PTRTOINT64(so->so_sp->ssp_socket);
> + kf->so_splicelen = so->so_sp->ssp_len;
> + } else if (issplicedback(so))
> kf->so_splicelen = -1;
> if (!so->so_pcb)
> break;
> Index: sys/kern/uipc_socket.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.133
> diff -u -p -u -p -r1.133 uipc_socket.c
> --- sys/kern/uipc_socket.c 9 Sep 2014 02:07:17 -0000 1.133
> +++ sys/kern/uipc_socket.c 31 Oct 2014 10:23:44 -0000
> @@ -80,12 +80,19 @@ int somaxconn = SOMAXCONN;
> int sominconn = SOMINCONN;
>
> struct pool socket_pool;
> +#ifdef SOCKET_SPLICE
> +struct pool sosplice_pool;
> +#endif
>
> void
> soinit(void)
> {
>
> pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
> +#ifdef SOCKET_SPLICE
> + pool_init(&sosplice_pool, sizeof(struct sosplice), 0, 0, 0, "sosppl",
> + NULL);
> +#endif
> }
>
> /*
> @@ -157,7 +164,7 @@ solisten(struct socket *so, int backlog)
> if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
> return (EOPNOTSUPP);
> #ifdef SOCKET_SPLICE
> - if (so->so_splice || so->so_spliceback)
> + if (isspliced(so) || issplicedback(so))
> return (EOPNOTSUPP);
> #endif /* SOCKET_SPLICE */
> s = splsoftnet();
> @@ -199,10 +206,15 @@ sofree(struct socket *so)
> return;
> }
> #ifdef SOCKET_SPLICE
> - if (so->so_spliceback)
> - sounsplice(so->so_spliceback, so, so->so_spliceback != so);
> - if (so->so_splice)
> - sounsplice(so, so->so_splice, 0);
> + if (so->so_sp) {
> + if (issplicedback(so))
> + sounsplice(so->so_sp->ssp_soback, so,
> + so->so_sp->ssp_soback != so);
> + if (isspliced(so))
> + sounsplice(so, so->so_sp->ssp_socket, 0);
> + pool_put(&sosplice_pool, so->so_sp);
> + so->so_sp = NULL;
> + }
> #endif /* SOCKET_SPLICE */
> sbrelease(&so->so_snd);
> sorflush(so);
> @@ -647,7 +659,7 @@ restart:
>
> m = so->so_rcv.sb_mb;
> #ifdef SOCKET_SPLICE
> - if (so->so_splice)
> + if (isspliced(so))
> m = NULL;
> #endif /* SOCKET_SPLICE */
> /*
> @@ -669,7 +681,7 @@ restart:
> #ifdef DIAGNOSTIC
> if (m == NULL && so->so_rcv.sb_cc)
> #ifdef SOCKET_SPLICE
> - if (so->so_splice == NULL)
> + if (!isspliced(so))
> #endif /* SOCKET_SPLICE */
> panic("receive 1");
> #endif
> @@ -1021,6 +1033,12 @@ sorflush(struct socket *so)
> }
>
> #ifdef SOCKET_SPLICE
> +
> +#define so_splicelen so_sp->ssp_len
> +#define so_splicemax so_sp->ssp_max
> +#define so_idletv so_sp->ssp_idletv
> +#define so_idleto so_sp->ssp_idleto
> +
> int
> sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
> {
> @@ -1035,6 +1053,8 @@ sosplice(struct socket *so, int fd, off_
> if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
> (so->so_proto->pr_flags & PR_CONNREQUIRED))
> return (ENOTCONN);
> + if (so->so_sp == NULL)
> + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
>
> /* If no fd is given, unsplice by removing existing link. */
> if (fd < 0) {
> @@ -1043,8 +1063,8 @@ sosplice(struct socket *so, int fd, off_
> (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0)
> return (error);
> s = splsoftnet();
> - if (so->so_splice)
> - sounsplice(so, so->so_splice, 1);
> + if (so->so_sp->ssp_socket)
> + sounsplice(so, so->so_sp->ssp_socket, 1);
> splx(s);
> sbunlock(&so->so_rcv);
> return (0);
> @@ -1060,6 +1080,8 @@ sosplice(struct socket *so, int fd, off_
> if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
> return (error);
> sosp = fp->f_data;
> + if (sosp->so_sp == NULL)
> + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
>
> /* Lock both receive and send buffer. */
> if ((error = sblock(&so->so_rcv,
> @@ -1074,7 +1096,7 @@ sosplice(struct socket *so, int fd, off_
> }
> s = splsoftnet();
>
> - if (so->so_splice || sosp->so_spliceback) {
> + if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
> error = EBUSY;
> goto release;
> }
> @@ -1092,8 +1114,8 @@ sosplice(struct socket *so, int fd, off_
> }
>
> /* Splice so and sosp together. */
> - so->so_splice = sosp;
> - sosp->so_spliceback = so;
> + so->so_sp->ssp_socket = sosp;
> + sosp->so_sp->ssp_soback = so;
> so->so_splicelen = 0;
> so->so_splicemax = max;
> if (tv)
> @@ -1127,7 +1149,7 @@ sounsplice(struct socket *so, struct soc
> timeout_del(&so->so_idleto);
> sosp->so_snd.sb_flagsintr &= ~SB_SPLICE;
> so->so_rcv.sb_flagsintr &= ~SB_SPLICE;
> - so->so_splice = sosp->so_spliceback = NULL;
> + so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
> if (wakeup && soreadable(so))
> sorwakeup(so);
> }
> @@ -1139,9 +1161,9 @@ soidle(void *arg)
> int s;
>
> s = splsoftnet();
> - if (so->so_splice) {
> + if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
> so->so_error = ETIMEDOUT;
> - sounsplice(so, so->so_splice, 1);
> + sounsplice(so, so->so_sp->ssp_socket, 1);
> }
> splx(s);
> }
> @@ -1155,7 +1177,7 @@ soidle(void *arg)
> int
> somove(struct socket *so, int wait)
> {
> - struct socket *sosp = so->so_splice;
> + struct socket *sosp = so->so_sp->ssp_socket;
> struct mbuf *m, **mp, *nextrecord;
> u_long len, off, oobmark;
> long space;
> @@ -1408,6 +1430,12 @@ somove(struct socket *so, int wait)
> timeout_add_tv(&so->so_idleto, &so->so_idletv);
> return (1);
> }
> +
> +#undef so_splicelen
> +#undef so_splicemax
> +#undef so_idletv
> +#undef so_idleto
> +
> #endif /* SOCKET_SPLICE */
>
> void
> @@ -1416,7 +1444,7 @@ sorwakeup(struct socket *so)
> #ifdef SOCKET_SPLICE
> if (so->so_rcv.sb_flagsintr & SB_SPLICE)
> (void) somove(so, M_DONTWAIT);
> - if (so->so_splice)
> + if (isspliced(so))
> return;
> #endif
> sowakeup(so, &so->so_rcv);
> @@ -1429,7 +1457,7 @@ sowwakeup(struct socket *so)
> {
> #ifdef SOCKET_SPLICE
> if (so->so_snd.sb_flagsintr & SB_SPLICE)
> - (void) somove(so->so_spliceback, M_DONTWAIT);
> + (void) somove(so->so_sp->ssp_soback, M_DONTWAIT);
> #endif
> sowakeup(so, &so->so_snd);
> }
> @@ -1722,11 +1750,12 @@ sogetopt(struct socket *so, int level, i
> #ifdef SOCKET_SPLICE
> case SO_SPLICE:
> {
> + off_t len;
> int s = splsoftnet();
>
> m->m_len = sizeof(off_t);
> - memcpy(mtod(m, off_t *), &so->so_splicelen,
> - sizeof(off_t));
> + len = so->so_sp ? so->so_sp->ssp_len : 0;
> + memcpy(mtod(m, off_t *), &len, sizeof(off_t));
> splx(s);
> break;
> }
> @@ -1815,7 +1844,7 @@ filt_soread(struct knote *kn, long hint)
>
> kn->kn_data = so->so_rcv.sb_cc;
> #ifdef SOCKET_SPLICE
> - if (so->so_splice)
> + if (isspliced(so))
> return (0);
> #endif /* SOCKET_SPLICE */
> if (so->so_state & SS_CANTRCVMORE) {
> Index: sys/sys/socketvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
> retrieving revision 1.56
> diff -u -p -u -p -r1.56 socketvar.h
> --- sys/sys/socketvar.h 9 Sep 2014 02:07:17 -0000 1.56
> +++ sys/sys/socketvar.h 31 Oct 2014 10:23:44 -0000
> @@ -81,13 +81,17 @@ struct socket {
> uid_t so_siguid; /* uid of process who set so_pgid */
> uid_t so_sigeuid; /* euid of process who set so_pgid */
> u_long so_oobmark; /* chars to oob mark */
> -
> - struct socket *so_splice; /* send data to drain socket */
> - struct socket *so_spliceback; /* back ref for notify and cleanup */
> - off_t so_splicelen; /* number of bytes spliced so far */
> - off_t so_splicemax; /* maximum number of bytes to splice */
> - struct timeval so_idletv; /* idle timeout */
> - struct timeout so_idleto;
> +/*
> + * Variables for socket splicing, allocated only when needed.
> + */
> + struct sosplice {
> + struct socket *ssp_socket; /* send data to drain socket */
> + struct socket *ssp_soback; /* back ref to source socket */
> + off_t ssp_len; /* number of bytes spliced */
> + off_t ssp_max; /* maximum number of bytes */
> + struct timeval ssp_idletv; /* idle timeout */
> + struct timeout ssp_idleto;
> + } *so_sp;
> /*
> * Variables for socket buffering.
> */
> @@ -148,6 +152,9 @@ struct socket {
> * Macros for sockets and socket buffering.
> */
>
> +#define isspliced(so) ((so)->so_sp && (so)->so_sp->ssp_socket)
> +#define issplicedback(so) ((so)->so_sp && (so)->so_sp->ssp_soback)
> +
> /*
> * Do we need to notify the other side when I/O is possible?
> */
> @@ -173,7 +180,7 @@ struct socket {
>
> /* can we read something from so? */
> #define soreadable(so) \
> - ((so)->so_splice == NULL && \
> + (!isspliced(so) && \
> ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
> ((so)->so_state & SS_CANTRCVMORE) || \
> (so)->so_qlen || (so)->so_error))
> Index: lib/libkvm/kvm_file2.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/lib/libkvm/kvm_file2.c,v
> retrieving revision 1.38
> diff -u -p -u -p -r1.38 kvm_file2.c
> --- lib/libkvm/kvm_file2.c 25 Oct 2014 03:18:58 -0000 1.38
> +++ lib/libkvm/kvm_file2.c 31 Oct 2014 11:40:05 -0000
> @@ -542,6 +542,7 @@ fill_file(kvm_t *kd, struct kinfo_file *
>
> case DTYPE_SOCKET: {
> struct socket sock;
> + struct sosplice ssp;
> struct protosw protosw;
> struct domain domain;
>
> @@ -565,11 +566,18 @@ fill_file(kvm_t *kd, struct kinfo_file *
> kf->so_family = domain.dom_family;
> kf->so_rcv_cc = sock.so_rcv.sb_cc;
> kf->so_snd_cc = sock.so_snd.sb_cc;
> - if (sock.so_splice) {
> - kf->so_splice = PTRTOINT64(sock.so_splice);
> - kf->so_splicelen = sock.so_splicelen;
> - } else if (sock.so_spliceback)
> - kf->so_splicelen = -1;
> + if (sock.so_sp) {
> + if (KREAD(kd, (u_long)sock.so_sp, &ssp)) {
> + _kvm_err(kd, kd->program, "can't read splice");
> + return (-1);
> + }
> + if (ssp.ssp_socket) {
> + kf->so_splice = PTRTOINT64(ssp.ssp_socket);
> + kf->so_splicelen = ssp.ssp_len;
> + } else if (ssp.ssp_soback) {
> + kf->so_splicelen = -1;
> + }
> + }
> if (!sock.so_pcb)
> break;
> switch (kf->so_family) {
> Index: usr.bin/netstat/inet.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
> retrieving revision 1.136
> diff -u -p -u -p -r1.136 inet.c
> --- usr.bin/netstat/inet.c 26 Oct 2014 14:43:03 -0000 1.136
> +++ usr.bin/netstat/inet.c 31 Oct 2014 11:00:17 -0000
> @@ -91,6 +91,7 @@ char *inetname(struct in_addr *);
> void inetprint(struct in_addr *, in_port_t, char *, int);
> char *inet6name(struct in6_addr *);
> void inet6print(struct in6_addr *, int, char *);
> +void sosplice_dump(u_long);
> void sockbuf_dump(struct sockbuf *, const char *);
> void protosw_dump(u_long, u_long);
> void domain_dump(u_long, u_long, short);
> @@ -1166,7 +1167,6 @@ socket_dump(u_long off)
> kread(off, &so, sizeof(so));
>
> #define p(fmt, v, sep) printf(#v " " fmt sep, so.v);
> -#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) so.v);
> #define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : so.v);
> printf("socket %#lx\n ", hideroot ? 0 : off);
> p("%#.4x", so_type, "\n ");
> @@ -1185,12 +1185,8 @@ socket_dump(u_long off)
> p("%u", so_siguid, ", ");
> p("%u", so_sigeuid, "\n ");
> p("%lu", so_oobmark, "\n ");
> - pp("%p", so_splice, ", ");
> - pp("%p", so_spliceback, "\n ");
> - p("%lld", so_splicelen, ", ");
> - p("%lld", so_splicemax, ", ");
> - pll("%lld", so_idletv.tv_sec, ", ");
> - p("%ld", so_idletv.tv_usec, "\n ");
> + if (so.so_sp)
> + sosplice_dump((u_long)so.so_sp);
> sockbuf_dump(&so.so_rcv, "so_rcv");
> sockbuf_dump(&so.so_snd, "so_snd");
> p("%u", so_euid, ", ");
> @@ -1204,6 +1200,32 @@ socket_dump(u_long off)
> if (!vflag)
> return;
> protosw_dump((u_long)so.so_proto, (u_long)so.so_pcb);
> +}
> +
> +/*
> + * Dump the contents of a struct sosplice
> + */
> +void
> +sosplice_dump(u_long off)
> +{
> + struct sosplice ssp;
> +
> + if (off == 0)
> + return;
> + kread(off, &ssp, sizeof(ssp));
> +
> +#define p(fmt, v, sep) printf(#v " " fmt sep, ssp.v);
> +#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) ssp.v);
> +#define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : ssp.v);
> + pp("%p", ssp_socket, ", ");
> + pp("%p", ssp_soback, "\n ");
> + p("%lld", ssp_len, ", ");
> + p("%lld", ssp_max, ", ");
> + pll("%lld", ssp_idletv.tv_sec, ", ");
> + p("%ld", ssp_idletv.tv_usec, "\n ");
> +#undef p
> +#undef pll
> +#undef pp
> }
>
> /*