I sent this message to dlg@ directly to discuss my modification of his diff
to make the bigger mbuf clusters work. I got no response so far, so I
decided to post it on tech@ directly. Maybe this way I get some feedback
faster :)
BR
Simon

### Original Mail:

---------- Forwarded message ----------
From: Simon Mages <mages.si...@googlemail.com>
Date: Fri, 22 Jul 2016 13:24:24 +0200
Subject: Re: [PATCH] let the mbufs use more then 4gb of memory
To: David Gwynne <da...@gwynne.id.au>

Hi,

I think I found the problem with your diff regarding the bigger mbuf
clusters. You choose a buffer size based on space and resid, but what
happens when resid is larger than space and space is, for example, 2050?
The cluster chosen then has a size of 4096, which is too large for the
socket buffer. In the past this was never a problem, because external
clusters were only ever allocated with size MCLBYTES, and only when space
was larger than MCLBYTES.

diff:

Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.152
diff -u -p -u -p -r1.152 uipc_socket.c
--- kern/uipc_socket.c	13 Jun 2016 21:24:43 -0000	1.152
+++ kern/uipc_socket.c	22 Jul 2016 10:56:02 -0000
@@ -496,15 +496,18 @@ restart:
 				mlen = MLEN;
 			}
 			if (resid >= MINCLSIZE && space >= MCLBYTES) {
-				MCLGET(m, M_NOWAIT);
+				MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
+				    lmin(space, MAXMCLBYTES)));
 				if ((m->m_flags & M_EXT) == 0)
 					goto nopages;
 				if (atomic && top == 0) {
-					len = ulmin(MCLBYTES - max_hdr,
-					    resid);
+					len = lmin(lmin(resid, space),
+					    m->m_ext.ext_size -
+					    max_hdr);
 					m->m_data += max_hdr;
 				} else
-					len = ulmin(MCLBYTES, resid);
+					len = lmin(lmin(resid, space),
+					    m->m_ext.ext_size);
 				space -= len;
 			} else {
 nopages:
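To make the effect of the extra clamp concrete, here is a small userland
illustration of the two length computations. The numbers follow the example
above (space of 2050, a 4096 byte cluster handed back by MCLGETI); the
concrete resid value and the lmin() re-definition exist only so the snippet
builds outside the kernel.

#include <stdio.h>

static long
lmin(long a, long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	long resid = 8192;	/* data still queued for the socket */
	long space = 2050;	/* free space left in the socket buffer */
	long ext_size = 4096;	/* cluster size picked for a 2050 byte request */

	/* your original diff: len is only limited by resid and the cluster */
	printf("without the space clamp: len = %ld\n",
	    lmin(resid, ext_size));			/* prints 4096 */

	/* with the additional lmin(..., space): len can no longer overshoot */
	printf("with the space clamp:    len = %ld\n",
	    lmin(lmin(resid, space), ext_size));	/* prints 2050 */

	return 0;
}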
I have been using this diff for a while now on my notebook and everything
works as expected, but I had no time yet to really test it or to test the
performance. That will be my next step.

I reproduced the unix socket problem you mentioned with the following
little program:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <err.h>
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/wait.h>

#define FILE "/tmp/afile"

int senddesc(int fd, int so);
int recvdesc(int so);

int
main(void)
{
	struct stat sb;
	int sockpair[2];
	pid_t pid = 0;
	int status;
	int newfile;

	if (unlink(FILE) < 0)
		warn("unlink: %s", FILE);

	int file = open(FILE, O_RDWR|O_CREAT|O_TRUNC, 0644);

	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_NONBLOCK, 0, sockpair) < 0)
		err(1, "socketpair");

	/* parent sends the descriptor plus payload, child receives it */
	if ((pid = fork())) {
		senddesc(file, sockpair[0]);
		if (waitpid(pid, &status, 0) < 0)
			err(1, "waitpid");
	} else {
		newfile = recvdesc(sockpair[1]);
		if (fstat(newfile, &sb) < 0)
			err(1, "fstat");
	}

	return 0;
}

int
senddesc(int fd, int so)
{
	struct msghdr msg;
	struct cmsghdr *cmsg;
	union {
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;

	char *cbuf = calloc(6392, sizeof(char));
	memset(cbuf, 'K', 6392);

	struct iovec iov = {
		.iov_base = cbuf,
		.iov_len = 6392,
	};

	memset(&msg, 0, sizeof(struct msghdr));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	/* attach the file descriptor as a SCM_RIGHTS control message */
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	*(int *)CMSG_DATA(cmsg) = fd;

	struct pollfd pfd[1];
	int nready;
	int wrote = 0;
	int wrote_total = 0;

	pfd[0].fd = so;
	pfd[0].events = POLLOUT;

	while (1) {
		nready = poll(pfd, 1, -1);
		if (nready == -1)
			err(1, "poll");
		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
			errx(1, "bad fd %d", pfd[0].fd);
		if ((pfd[0].revents & (POLLOUT|POLLHUP))) {
			if ((wrote = sendmsg(so, &msg, 0)) < 0)
				err(1, "sendmsg");
		}
		wrote_total += wrote;
		iov.iov_len -= wrote * sizeof(char);
		if (iov.iov_len <= 0) {
			printf("send all data: %d byte\n", wrote_total);
			break;
		}
	}

	return 0;
}

int
recvdesc(int so)
{
	int fd = -1;
	struct msghdr msg;
	struct cmsghdr *cmsg;
	union {
		struct cmsghdr hdr;
		unsigned char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;

	struct iovec iov;
	iov.iov_base = calloc(6392, sizeof(char));
	iov.iov_len = 6392 * sizeof(char);

	memset(&msg, 0, sizeof(struct msghdr));
	msg.msg_control = &cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	struct pollfd pfd[1];
	int nready;
	int read_data = 0;
	int total_read_data = 0;

	pfd[0].fd = so;
	pfd[0].events = POLLIN;

	while (1) {
		nready = poll(pfd, 1, -1);
		if (nready == -1)
			err(1, "poll");
		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
			errx(1, "bad fd %d", pfd[0].fd);
		if ((pfd[0].revents & (POLLIN|POLLHUP))) {
			if ((read_data = recvmsg(so, &msg, 0)) < 0)
				err(1, "recvmsg");
		}
		total_read_data += read_data;
		iov.iov_len -= read_data * sizeof(char);
		if (iov.iov_len <= 0) {
			printf("received all data: %d byte\n", total_read_data);
			break;
		}
	}

	if ((msg.msg_flags & MSG_CTRUNC))
		errx(1, "control message truncated");
	if ((msg.msg_flags & MSG_TRUNC))
		errx(1, "message truncated");

	/* pick the received descriptor out of the control data */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
		    cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			fd = *(int *)CMSG_DATA(cmsg);
		}
	}

	return fd;
}

Without the fix the following happens: MCLGETI will choose a cluster that
is too large for the receive buffer of the second domain socket and will
then also move too much data. The data then goes down the pr_usrreq path,
for a unix socket uipc_usrreq(), where sbappendcontrol() is used to move it
into the receiving socket buffer. That is not possible, so it just returns
zero. The sender thinks it sent all the data while the receiver gets
nothing, or only another part that was small enough later on. There is also
no error signaling there, so the mbufs just leak.
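To spell the pattern out without having to read the kernel path, the toy
model below plays it back in userland. All names in it are made up for the
illustration; it is not the uipc_usrreq()/sbappendcontrol() code, only the
behaviour described above (an append that does not fit is refused, and the
caller ignores the refusal).

#include <stdio.h>

#define SB_SPACE	2050	/* free space in the peer's receive buffer */

static size_t sb_used;

/* stand-in for sbappendcontrol(): refuse a chain that does not fit */
static int
sb_append(size_t len)
{
	if (sb_used + len > SB_SPACE)
		return 0;
	sb_used += len;
	return 1;
}

int
main(void)
{
	size_t len = 4096;	/* cluster chosen for a 2050 byte request */
	size_t sent = 0;

	sb_append(len);		/* return value ignored, as in the send path */
	sent += len;		/* the sender still accounts for the bytes */

	printf("sender accounted for %zu bytes\n", sent);
	printf("receive buffer actually holds %zu bytes\n", sb_used);
	return 0;
}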
Maybe this should be handled in a different way, but I did not really think
about that so far.

BR
Simon

2016-07-01 2:45 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
>
>> On 1 Jul 2016, at 04:44, Simon Mages <mages.si...@googlemail.com> wrote:
>>
>> Do you remember what the problem was you encountered with ospfd and
>> your kern_socket diff?
>
> the unix socket pair between two of the processes closed unexpectedly or
> had the wrong amount of data available. it was too long ago for me to
> recall correctly :(
>
>>
>> I'm not a user of ospf :)
>
> thats fine. id hope more of the multiprocess daemons like ntpd and smtpd
> would exhibit the behaviour too.
>
> do you have a way of testing performance of sockets?
>
> dlg
>
>>
>> 2016-06-22 14:54 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
>>> On Wed, Jun 22, 2016 at 01:58:25PM +0200, Simon Mages wrote:
>>>> On a system where you use the maximum socket buffer size of 256 kbyte
>>>> you can run out of memory after less than 9k open sockets.
>>>>
>>>> My patch adds a new uvm_constraint for the mbufs with a bigger memory
>>>> area. I chose this area after reading the comments in
>>>> sys/arch/amd64/include/pmap.h. This patch further changes the maximum
>>>> socket buffer size from 256k to 1gb, as described in RFC 1323 S2.3.
>>>>
>>>> I tested this diff with the ix, em and urndis driver. I know that this
>>>> diff only works for amd64 right now, but I wanted to send it as a
>>>> proposal for what could be done. Maybe somebody has a different
>>>> solution for this problem or can tell me why this is a bad idea.
>>>
>>> hey simon,
>>>
>>> first, some background.
>>>
>>> the 4G watermark is less about limiting the amount of memory used by
>>> the network stack and more about making the memory addressable by as
>>> many devices, including network cards, as possible. we support older
>>> chips that only deal with 32 bit addresses (and one or two stupid ones
>>> with an inability to address over 1G), so we took the conservative
>>> option and made the memory generally usable without developers having
>>> to think about it much.
>>>
>>> you could argue that you should be able to give big addresses to
>>> modern cards, but that falls down if you are forwarding packets
>>> between a modern and an old card, cos the old card will want to dma
>>> the packet the modern card rxed, but it needs it below the 4g line.
>>> even if you dont have an old card, in todays hotplug world you might
>>> plug an old device in. either way, the future of an mbuf is very hard
>>> for the kernel to predict.
>>>
>>> secondly, allocating more than 4g at a time to socket buffers is
>>> generally a waste of memory. in practice you should scale the amount
>>> of memory available to sockets according to the size of the tcp
>>> windows you need to saturate the bandwidth available to the box. this
>>> means if you want to sustain a gigabit of traffic with a 300ms round
>>> trip time for packets, you'd "only" need ~37.5 megabytes of buffers.
>>> to sustain 40 gigabit you'd need 1.5 gigabytes, which is still below
>>> 4G. allowing more use of memory for buffers would likely induce
>>> latency.
>>>
>>> the above means that if you want to sustain a single 40G tcp
>>> connection to that host you'd need to be able to place 1.5G on the
>>> socket buffer, which is above the 1G you mention above. however, if
>>> you want to sustain 2 connections, you ideally want to fairly share
>>> the 1.5G between both sockets. they should get 750M each.
>>>
>>> fairly sharing buffers between the sockets may already be in place in
>>> openbsd. when i reworked the pools subsystem i set it up so things
>>> sleeping on memory were woken up in order.
>>>
>>> it occurs to me that perhaps we should limit mbufs by the bytes they
>>> can use rather than the number of them. that would also work well if
>>> we moved to per cpu caches for mbufs and clusters, cos the number of
>>> active mbufs in the system becomes hard to limit accurately if we want
>>> cpus to run independently.
>>>
>>> if you want something to work on in this area, could you look at
>>> letting sockets use the "jumbo" clusters instead of assuming
>>> everything has to be in 2k clusters? i started on this with the diff
>>> below, but it broke ospfd and i never got back to it.
>>>
>>> if you get it working, it would be interesting to test creating even
>>> bigger cluster pools, eg, a 1M or 4M mbuf cluster.
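Just to double check the buffer sizing above: it is the bandwidth-delay
product, and a trivial userland calculation reproduces the numbers. The
link speeds and the 300ms round trip time are taken straight from the mail;
everything else in the snippet is only there to print the result.

#include <stdio.h>

int
main(void)
{
	double rtt = 0.3;			/* 300 ms round trip time */
	double gbits[] = { 1.0, 40.0 };

	for (int i = 0; i < 2; i++) {
		double bytes = gbits[i] * 1e9 / 8.0 * rtt;
		printf("%2.0f Gbit/s over %.0f ms rtt -> %.1f MB of buffer\n",
		    gbits[i], rtt * 1000, bytes / 1e6);
		/* prints ~37.5 MB for 1 Gbit/s and 1500 MB for 40 Gbit/s */
	}
	return 0;
}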
>>>
>>> cheers,
>>> dlg
>>>
>>> Index: uipc_socket.c
>>> ===================================================================
>>> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
>>> retrieving revision 1.135
>>> diff -u -p -r1.135 uipc_socket.c
>>> --- uipc_socket.c	11 Dec 2014 19:21:57 -0000	1.135
>>> +++ uipc_socket.c	22 Dec 2014 01:11:03 -0000
>>> @@ -493,15 +493,18 @@ restart:
>>>  				mlen = MLEN;
>>>  			}
>>>  			if (resid >= MINCLSIZE && space >= MCLBYTES) {
>>> -				MCLGET(m, M_NOWAIT);
>>> +				MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
>>> +				    lmin(space, MAXMCLBYTES)));
>>>  				if ((m->m_flags & M_EXT) == 0)
>>>  					goto nopages;
>>>  				if (atomic && top == 0) {
>>> -					len = lmin(MCLBYTES - max_hdr,
>>> -					    resid);
>>> +					len = lmin(resid,
>>> +					    m->m_ext.ext_size -
>>> +					    max_hdr);
>>>  					m->m_data += max_hdr;
>>>  				} else
>>> -					len = lmin(MCLBYTES, resid);
>>> +					len = lmin(resid,
>>> +					    m->m_ext.ext_size);
>>>  				space -= len;
>>>  			} else {
>>>  nopages:
>>>
>
>
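Since the question about testing socket performance came up: this is a
minimal sketch of the kind of measurement I have in mind, pushing a fixed
amount of data through an AF_UNIX socketpair and reporting MB/s. The chunk
size and total volume are arbitrary choices for the sketch; a real test
would also cover TCP sockets and different socket buffer sizes.

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>

#define CHUNK	(64 * 1024)
#define TOTAL	(1024LL * 1024 * 1024)	/* move 1 GB in total */

int
main(void)
{
	int sv[2];
	char *buf;
	long long done;
	ssize_t n;
	struct timespec t0, t1;
	double secs;
	pid_t pid;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		err(1, "socketpair");
	if ((buf = calloc(1, CHUNK)) == NULL)
		err(1, "calloc");

	if ((pid = fork()) == -1)
		err(1, "fork");

	if (pid == 0) {
		/* child: drain the socket until EOF */
		close(sv[0]);
		while ((n = read(sv[1], buf, CHUNK)) > 0)
			continue;
		_exit(0);
	}

	/* parent: time how long it takes to write TOTAL bytes */
	close(sv[1]);
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (done = 0; done < TOTAL; done += n) {
		if ((n = write(sv[0], buf, CHUNK)) < 0)
			err(1, "write");
	}
	close(sv[0]);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	if (waitpid(pid, NULL, 0) < 0)
		err(1, "waitpid");

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%.1f MB/s\n", TOTAL / secs / (1024.0 * 1024.0));
	return 0;
}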