On Fri, Oct 31, 2014 at 02:50:00PM +1000, David Gwynne wrote:
> so without splicing, the payloads from multiple tcp packets (at least all of
> the ones in a single softnet run?) get bundled up into a buffer that userland
> reads and then writes out again in a single go. right?
>
> you're suggesting the taskq as a way to defer output till after the current
> softnet call has processed all its packets and queued all the tcp packet
> payloads onto the socket?
Exactly. > its worth remembering there are other memory costs too. i think a kthread > (the thing taskqs run on) is 5 pages amd64, so 20KB. We can delay the creation of the sosplice thread until user-land tries to splice for the first time. I would like to get in the sosplice pool in first. The user-land part was missing in my previous diff. Updated diff that actually can do make build. ok? bluhm Index: sys/kern/kern_sysctl.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v retrieving revision 1.267 diff -u -p -u -p -r1.267 kern_sysctl.c --- sys/kern/kern_sysctl.c 17 Oct 2014 01:51:39 -0000 1.267 +++ sys/kern/kern_sysctl.c 31 Oct 2014 10:23:44 -0000 @@ -1062,11 +1062,12 @@ fill_file(struct kinfo_file *kf, struct kf->so_family = so->so_proto->pr_domain->dom_family; kf->so_rcv_cc = so->so_rcv.sb_cc; kf->so_snd_cc = so->so_snd.sb_cc; - if (so->so_splice) { + if (isspliced(so)) { if (show_pointers) - kf->so_splice = PTRTOINT64(so->so_splice); - kf->so_splicelen = so->so_splicelen; - } else if (so->so_spliceback) + kf->so_splice = + PTRTOINT64(so->so_sp->ssp_socket); + kf->so_splicelen = so->so_sp->ssp_len; + } else if (issplicedback(so)) kf->so_splicelen = -1; if (!so->so_pcb) break; Index: sys/kern/uipc_socket.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.133 diff -u -p -u -p -r1.133 uipc_socket.c --- sys/kern/uipc_socket.c 9 Sep 2014 02:07:17 -0000 1.133 +++ sys/kern/uipc_socket.c 31 Oct 2014 10:23:44 -0000 @@ -80,12 +80,19 @@ int somaxconn = SOMAXCONN; int sominconn = SOMINCONN; struct pool socket_pool; +#ifdef SOCKET_SPLICE +struct pool sosplice_pool; +#endif void soinit(void) { pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL); +#ifdef SOCKET_SPLICE + pool_init(&sosplice_pool, sizeof(struct sosplice), 0, 0, 0, "sosppl", + NULL); +#endif } /* @@ -157,7 
+164,7 @@ solisten(struct socket *so, int backlog) if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) return (EOPNOTSUPP); #ifdef SOCKET_SPLICE - if (so->so_splice || so->so_spliceback) + if (isspliced(so) || issplicedback(so)) return (EOPNOTSUPP); #endif /* SOCKET_SPLICE */ s = splsoftnet(); @@ -199,10 +206,15 @@ sofree(struct socket *so) return; } #ifdef SOCKET_SPLICE - if (so->so_spliceback) - sounsplice(so->so_spliceback, so, so->so_spliceback != so); - if (so->so_splice) - sounsplice(so, so->so_splice, 0); + if (so->so_sp) { + if (issplicedback(so)) + sounsplice(so->so_sp->ssp_soback, so, + so->so_sp->ssp_soback != so); + if (isspliced(so)) + sounsplice(so, so->so_sp->ssp_socket, 0); + pool_put(&sosplice_pool, so->so_sp); + so->so_sp = NULL; + } #endif /* SOCKET_SPLICE */ sbrelease(&so->so_snd); sorflush(so); @@ -647,7 +659,7 @@ restart: m = so->so_rcv.sb_mb; #ifdef SOCKET_SPLICE - if (so->so_splice) + if (isspliced(so)) m = NULL; #endif /* SOCKET_SPLICE */ /* @@ -669,7 +681,7 @@ restart: #ifdef DIAGNOSTIC if (m == NULL && so->so_rcv.sb_cc) #ifdef SOCKET_SPLICE - if (so->so_splice == NULL) + if (!isspliced(so)) #endif /* SOCKET_SPLICE */ panic("receive 1"); #endif @@ -1021,6 +1033,12 @@ sorflush(struct socket *so) } #ifdef SOCKET_SPLICE + +#define so_splicelen so_sp->ssp_len +#define so_splicemax so_sp->ssp_max +#define so_idletv so_sp->ssp_idletv +#define so_idleto so_sp->ssp_idleto + int sosplice(struct socket *so, int fd, off_t max, struct timeval *tv) { @@ -1035,6 +1053,8 @@ sosplice(struct socket *so, int fd, off_ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (ENOTCONN); + if (so->so_sp == NULL) + so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); /* If no fd is given, unsplice by removing existing link. */ if (fd < 0) { @@ -1043,8 +1063,8 @@ sosplice(struct socket *so, int fd, off_ (so->so_state & SS_NBIO) ? 
M_NOWAIT : M_WAITOK)) != 0) return (error); s = splsoftnet(); - if (so->so_splice) - sounsplice(so, so->so_splice, 1); + if (so->so_sp->ssp_socket) + sounsplice(so, so->so_sp->ssp_socket, 1); splx(s); sbunlock(&so->so_rcv); return (0); @@ -1060,6 +1080,8 @@ sosplice(struct socket *so, int fd, off_ if ((error = getsock(curproc->p_fd, fd, &fp)) != 0) return (error); sosp = fp->f_data; + if (sosp->so_sp == NULL) + sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO); /* Lock both receive and send buffer. */ if ((error = sblock(&so->so_rcv, @@ -1074,7 +1096,7 @@ sosplice(struct socket *so, int fd, off_ } s = splsoftnet(); - if (so->so_splice || sosp->so_spliceback) { + if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) { error = EBUSY; goto release; } @@ -1092,8 +1114,8 @@ sosplice(struct socket *so, int fd, off_ } /* Splice so and sosp together. */ - so->so_splice = sosp; - sosp->so_spliceback = so; + so->so_sp->ssp_socket = sosp; + sosp->so_sp->ssp_soback = so; so->so_splicelen = 0; so->so_splicemax = max; if (tv) @@ -1127,7 +1149,7 @@ sounsplice(struct socket *so, struct soc timeout_del(&so->so_idleto); sosp->so_snd.sb_flagsintr &= ~SB_SPLICE; so->so_rcv.sb_flagsintr &= ~SB_SPLICE; - so->so_splice = sosp->so_spliceback = NULL; + so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; if (wakeup && soreadable(so)) sorwakeup(so); } @@ -1139,9 +1161,9 @@ soidle(void *arg) int s; s = splsoftnet(); - if (so->so_splice) { + if (so->so_rcv.sb_flagsintr & SB_SPLICE) { so->so_error = ETIMEDOUT; - sounsplice(so, so->so_splice, 1); + sounsplice(so, so->so_sp->ssp_socket, 1); } splx(s); } @@ -1155,7 +1177,7 @@ soidle(void *arg) int somove(struct socket *so, int wait) { - struct socket *sosp = so->so_splice; + struct socket *sosp = so->so_sp->ssp_socket; struct mbuf *m, **mp, *nextrecord; u_long len, off, oobmark; long space; @@ -1408,6 +1430,12 @@ somove(struct socket *so, int wait) timeout_add_tv(&so->so_idleto, &so->so_idletv); return (1); } + +#undef so_splicelen 
+#undef so_splicemax +#undef so_idletv +#undef so_idleto + #endif /* SOCKET_SPLICE */ void @@ -1416,7 +1444,7 @@ sorwakeup(struct socket *so) #ifdef SOCKET_SPLICE if (so->so_rcv.sb_flagsintr & SB_SPLICE) (void) somove(so, M_DONTWAIT); - if (so->so_splice) + if (isspliced(so)) return; #endif sowakeup(so, &so->so_rcv); @@ -1429,7 +1457,7 @@ sowwakeup(struct socket *so) { #ifdef SOCKET_SPLICE if (so->so_snd.sb_flagsintr & SB_SPLICE) - (void) somove(so->so_spliceback, M_DONTWAIT); + (void) somove(so->so_sp->ssp_soback, M_DONTWAIT); #endif sowakeup(so, &so->so_snd); } @@ -1722,11 +1750,12 @@ sogetopt(struct socket *so, int level, i #ifdef SOCKET_SPLICE case SO_SPLICE: { + off_t len; int s = splsoftnet(); m->m_len = sizeof(off_t); - memcpy(mtod(m, off_t *), &so->so_splicelen, - sizeof(off_t)); + len = so->so_sp ? so->so_sp->ssp_len : 0; + memcpy(mtod(m, off_t *), &len, sizeof(off_t)); splx(s); break; } @@ -1815,7 +1844,7 @@ filt_soread(struct knote *kn, long hint) kn->kn_data = so->so_rcv.sb_cc; #ifdef SOCKET_SPLICE - if (so->so_splice) + if (isspliced(so)) return (0); #endif /* SOCKET_SPLICE */ if (so->so_state & SS_CANTRCVMORE) { Index: sys/sys/socketvar.h =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v retrieving revision 1.56 diff -u -p -u -p -r1.56 socketvar.h --- sys/sys/socketvar.h 9 Sep 2014 02:07:17 -0000 1.56 +++ sys/sys/socketvar.h 31 Oct 2014 10:23:44 -0000 @@ -81,13 +81,17 @@ struct socket { uid_t so_siguid; /* uid of process who set so_pgid */ uid_t so_sigeuid; /* euid of process who set so_pgid */ u_long so_oobmark; /* chars to oob mark */ - - struct socket *so_splice; /* send data to drain socket */ - struct socket *so_spliceback; /* back ref for notify and cleanup */ - off_t so_splicelen; /* number of bytes spliced so far */ - off_t so_splicemax; /* maximum number of bytes to splice */ - struct timeval so_idletv; /* idle timeout */ - struct timeout so_idleto; +/* + * 
Variables for socket splicing, allocated only when needed. + */ + struct sosplice { + struct socket *ssp_socket; /* send data to drain socket */ + struct socket *ssp_soback; /* back ref to source socket */ + off_t ssp_len; /* number of bytes spliced */ + off_t ssp_max; /* maximum number of bytes */ + struct timeval ssp_idletv; /* idle timeout */ + struct timeout ssp_idleto; + } *so_sp; /* * Variables for socket buffering. */ @@ -148,6 +152,9 @@ struct socket { * Macros for sockets and socket buffering. */ +#define isspliced(so) ((so)->so_sp && (so)->so_sp->ssp_socket) +#define issplicedback(so) ((so)->so_sp && (so)->so_sp->ssp_soback) + /* * Do we need to notify the other side when I/O is possible? */ @@ -173,7 +180,7 @@ struct socket { /* can we read something from so? */ #define soreadable(so) \ - ((so)->so_splice == NULL && \ + (!isspliced(so) && \ ((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \ ((so)->so_state & SS_CANTRCVMORE) || \ (so)->so_qlen || (so)->so_error)) Index: lib/libkvm/kvm_file2.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/lib/libkvm/kvm_file2.c,v retrieving revision 1.38 diff -u -p -u -p -r1.38 kvm_file2.c --- lib/libkvm/kvm_file2.c 25 Oct 2014 03:18:58 -0000 1.38 +++ lib/libkvm/kvm_file2.c 31 Oct 2014 11:40:05 -0000 @@ -542,6 +542,7 @@ fill_file(kvm_t *kd, struct kinfo_file * case DTYPE_SOCKET: { struct socket sock; + struct sosplice ssp; struct protosw protosw; struct domain domain; @@ -565,11 +566,18 @@ fill_file(kvm_t *kd, struct kinfo_file * kf->so_family = domain.dom_family; kf->so_rcv_cc = sock.so_rcv.sb_cc; kf->so_snd_cc = sock.so_snd.sb_cc; - if (sock.so_splice) { - kf->so_splice = PTRTOINT64(sock.so_splice); - kf->so_splicelen = sock.so_splicelen; - } else if (sock.so_spliceback) - kf->so_splicelen = -1; + if (sock.so_sp) { + if (KREAD(kd, (u_long)sock.so_sp, &ssp)) { + _kvm_err(kd, kd->program, "can't read splice"); + return (-1); + } + if (ssp.ssp_socket) { + 
kf->so_splice = PTRTOINT64(ssp.ssp_socket); + kf->so_splicelen = ssp.ssp_len; + } else if (ssp.ssp_soback) { + kf->so_splicelen = -1; + } + } if (!sock.so_pcb) break; switch (kf->so_family) { Index: usr.bin/netstat/inet.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v retrieving revision 1.136 diff -u -p -u -p -r1.136 inet.c --- usr.bin/netstat/inet.c 26 Oct 2014 14:43:03 -0000 1.136 +++ usr.bin/netstat/inet.c 31 Oct 2014 11:00:17 -0000 @@ -91,6 +91,7 @@ char *inetname(struct in_addr *); void inetprint(struct in_addr *, in_port_t, char *, int); char *inet6name(struct in6_addr *); void inet6print(struct in6_addr *, int, char *); +void sosplice_dump(u_long); void sockbuf_dump(struct sockbuf *, const char *); void protosw_dump(u_long, u_long); void domain_dump(u_long, u_long, short); @@ -1166,7 +1167,6 @@ socket_dump(u_long off) kread(off, &so, sizeof(so)); #define p(fmt, v, sep) printf(#v " " fmt sep, so.v); -#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) so.v); #define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : so.v); printf("socket %#lx\n ", hideroot ? 
0 : off); p("%#.4x", so_type, "\n "); @@ -1185,12 +1185,8 @@ socket_dump(u_long off) p("%u", so_siguid, ", "); p("%u", so_sigeuid, "\n "); p("%lu", so_oobmark, "\n "); - pp("%p", so_splice, ", "); - pp("%p", so_spliceback, "\n "); - p("%lld", so_splicelen, ", "); - p("%lld", so_splicemax, ", "); - pll("%lld", so_idletv.tv_sec, ", "); - p("%ld", so_idletv.tv_usec, "\n "); + if (so.so_sp) + sosplice_dump((u_long)so.so_sp); sockbuf_dump(&so.so_rcv, "so_rcv"); sockbuf_dump(&so.so_snd, "so_snd"); p("%u", so_euid, ", "); @@ -1204,6 +1200,32 @@ socket_dump(u_long off) if (!vflag) return; protosw_dump((u_long)so.so_proto, (u_long)so.so_pcb); +} + +/* + * Dump the contents of a struct sosplice + */ +void +sosplice_dump(u_long off) +{ + struct sosplice ssp; + + if (off == 0) + return; + kread(off, &ssp, sizeof(ssp)); + +#define p(fmt, v, sep) printf(#v " " fmt sep, ssp.v); +#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) ssp.v); +#define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : ssp.v); + pp("%p", ssp_socket, ", "); + pp("%p", ssp_soback, "\n "); + p("%lld", ssp_len, ", "); + p("%lld", ssp_max, ", "); + pll("%lld", ssp_idletv.tv_sec, ", "); + p("%ld", ssp_idletv.tv_usec, "\n "); +#undef p +#undef pll +#undef pp } /*