> On 1 Aug 2016, at 21:07, Simon Mages <mages.si...@googlemail.com> wrote:
>
> I sent this message to dlg@ directly to discuss my modification of his
> diff to make the bigger mbuf clusters work. I got no response so far,
> which is why I decided to post it on tech@ directly. Maybe this way I
> get some feedback faster :)

hey simon,

i was travelling when you sent your mail to me and then it fell out of
my head. sorry about that.

if this is working correctly then i would like to put it in the tree.
from the light testing i have done, it is working correctly. would
anyone object?

some performance measurement would also be interesting :)

dlg

> BR
> Simon
>
> ### Original Mail:
>
> ---------- Forwarded message ----------
> From: Simon Mages <mages.si...@googlemail.com>
> Date: Fri, 22 Jul 2016 13:24:24 +0200
> Subject: Re: [PATCH] let the mbufs use more then 4gb of memory
> To: David Gwynne <da...@gwynne.id.au>
>
> Hi,
>
> I think I found the problem with your diff regarding the bigger mbuf
> clusters.
>
> You choose a buffer size based on space and resid, but what happens
> when resid is larger than space and space is, for example, 2050? The
> cluster chosen then has size 4096, but that size is too large for the
> socket buffer. In the past this was never a problem, because external
> clusters were only ever allocated with size MCLBYTES, and only when
> space was larger than MCLBYTES.
>
> diff:
> Index: kern/uipc_socket.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> retrieving revision 1.152
> diff -u -p -u -p -r1.152 uipc_socket.c
> --- kern/uipc_socket.c	13 Jun 2016 21:24:43 -0000	1.152
> +++ kern/uipc_socket.c	22 Jul 2016 10:56:02 -0000
> @@ -496,15 +496,18 @@ restart:
>  			mlen = MLEN;
>  		}
>  		if (resid >= MINCLSIZE && space >= MCLBYTES) {
> -			MCLGET(m, M_NOWAIT);
> +			MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
> +			    lmin(space, MAXMCLBYTES)));
>  			if ((m->m_flags & M_EXT) == 0)
>  				goto nopages;
>  			if (atomic && top == 0) {
> -				len = ulmin(MCLBYTES - max_hdr,
> -				    resid);
> +				len = lmin(lmin(resid, space),
> +				    m->m_ext.ext_size -
> +				    max_hdr);
>  				m->m_data += max_hdr;
>  			} else
> -				len = ulmin(MCLBYTES, resid);
> +				len = lmin(lmin(resid, space),
> +				    m->m_ext.ext_size);
>  			space -= len;
>  		} else {
>  nopages:
>
> I'm using this diff for a while now on my notebook and everything works
> as expected. But I had no time yet to really test it or to test the
> performance. That will be my next step.
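
To make the 2050-byte case above concrete, here is a small user-space
sketch of the two length calculations. The cluster_size() table and the
lmin() helper are simplified stand-ins for the kernel's cluster pools,
not the real allocator; the point is only the arithmetic: the unclamped
computation hands sosend() a len that is larger than the space left in
the socket buffer, the clamped one does not.

/*
 * Illustration only: pick a cluster for resid/space the way the diffs
 * in this thread do, then compute len with and without the clamp on
 * space. The size table mirrors the usual mbuf cluster sizes and may
 * not match every kernel exactly.
 */
#include <stdio.h>

#define MCLBYTES	2048
#define MAXMCLBYTES	(64 * 1024)

static long
lmin(long a, long b)
{
	return (a < b ? a : b);
}

/* smallest illustrative cluster that fits the hint */
static long
cluster_size(long hint)
{
	static const long sizes[] = { MCLBYTES, 4096, 8192, 9216,
	    12288, 16384, 65536 };
	int i;

	for (i = 0; i < 7; i++) {
		if (hint <= sizes[i])
			return (sizes[i]);
	}
	return (sizes[6]);
}

int
main(void)
{
	long resid = 6392;	/* data left to send */
	long space = 2050;	/* free space in the socket buffer */
	long ext_size, len_old, len_new;

	/* both diffs ask for a cluster sized like this */
	ext_size = cluster_size(lmin(resid, lmin(space, MAXMCLBYTES)));

	/* original diff: len ignores space once the cluster exists */
	len_old = lmin(resid, ext_size);

	/* adjusted diff: len is clamped by space as well */
	len_new = lmin(lmin(resid, space), ext_size);

	printf("cluster %ld, old len %ld vs space %ld, new len %ld\n",
	    ext_size, len_old, space, len_new);
	return (0);
}

With space = 2050 and resid = 6392 it reports a 4096-byte cluster and an
unclamped len of 4096 against only 2050 bytes of socket buffer space,
while the clamped len stays at 2050.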
>
> I reproduced the unix socket problem you mentioned with the following
> little program:
>
> #include <stdlib.h>
> #include <stdio.h>
> #include <string.h>
> #include <err.h>
> #include <fcntl.h>
> #include <poll.h>
> #include <unistd.h>
>
> #include <sys/socket.h>
> #include <sys/stat.h>
> #include <sys/wait.h>
>
> #define FILE "/tmp/afile"
>
> int senddesc(int fd, int so);
> int recvdesc(int so);
>
> int
> main(void)
> {
> 	struct stat sb;
> 	int sockpair[2];
> 	pid_t pid = 0;
> 	int status;
> 	int newfile;
>
> 	if (unlink(FILE) < 0)
> 		warn("unlink: %s", FILE);
>
> 	int file = open(FILE, O_RDWR|O_CREAT|O_TRUNC);
>
> 	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_NONBLOCK, 0, sockpair) < 0)
> 		err(1, "socketpair");
>
> 	if ((pid = fork())) {
> 		senddesc(file, sockpair[0]);
> 		if (waitpid(pid, &status, 0) < 0)
> 			err(1, "waitpid");
> 	} else {
> 		newfile = recvdesc(sockpair[1]);
> 		if (fstat(newfile, &sb) < 0)
> 			err(1, "fstat");
> 	}
>
> 	return 0;
> }
>
> int
> senddesc(int fd, int so)
> {
> 	struct msghdr msg;
> 	struct cmsghdr *cmsg;
> 	union {
> 		struct cmsghdr hdr;
> 		unsigned char buf[CMSG_SPACE(sizeof(int))];
> 	} cmsgbuf;
>
> 	char *cbuf = calloc(6392, sizeof(char));
> 	memset(cbuf, 'K', 6392);
> 	struct iovec iov = {
> 		.iov_base = cbuf,
> 		.iov_len = 6392,
> 	};
>
> 	memset(&msg, 0, sizeof(struct msghdr));
> 	msg.msg_iov = &iov;
> 	msg.msg_iovlen = 1;
> 	msg.msg_control = &cmsgbuf.buf;
> 	msg.msg_controllen = sizeof(cmsgbuf.buf);
>
> 	cmsg = CMSG_FIRSTHDR(&msg);
> 	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> 	cmsg->cmsg_level = SOL_SOCKET;
> 	cmsg->cmsg_type = SCM_RIGHTS;
> 	*(int *)CMSG_DATA(cmsg) = fd;
>
> 	struct pollfd pfd[1];
> 	int nready;
> 	int wrote = 0;
> 	int wrote_total = 0;
> 	pfd[0].fd = so;
> 	pfd[0].events = POLLOUT;
>
> 	while (1) {
> 		nready = poll(pfd, 1, -1);
> 		if (nready == -1)
> 			err(1, "poll");
> 		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
> 			errx(1, "bad fd %d", pfd[0].fd);
> 		if ((pfd[0].revents & (POLLOUT|POLLHUP))) {
> 			if ((wrote = sendmsg(so, &msg, 0)) < 0)
> 				err(1, "sendmsg");
> 		}
> 		wrote_total += wrote;
> 		iov.iov_len -= wrote * sizeof(char);
> 		if (iov.iov_len <= 0) {
> 			printf("send all data: %d byte\n", wrote_total);
> 			break;
> 		}
> 	}
> 	return 0;
> }
>
> int
> recvdesc(int so)
> {
> 	int fd = -1;
>
> 	struct msghdr msg;
> 	struct cmsghdr *cmsg;
> 	union {
> 		struct cmsghdr hdr;
> 		unsigned char buf[CMSG_SPACE(sizeof(int))];
> 	} cmsgbuf;
> 	struct iovec iov;
> 	iov.iov_base = calloc(6392, sizeof(char));
> 	iov.iov_len = 6392 * sizeof(char);
>
> 	memset(&msg, 0, sizeof(struct msghdr));
> 	msg.msg_control = &cmsgbuf.buf;
> 	msg.msg_controllen = sizeof(cmsgbuf.buf);
> 	msg.msg_iov = &iov;
> 	msg.msg_iovlen = 1;
>
> 	struct pollfd pfd[1];
> 	int nready;
> 	int read_data = 0;
> 	int total_read_data = 0;
> 	pfd[0].fd = so;
> 	pfd[0].events = POLLIN;
>
> 	while (1) {
> 		nready = poll(pfd, 1, -1);
> 		if (nready == -1)
> 			err(1, "poll");
> 		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
> 			errx(1, "bad fd %d", pfd[0].fd);
> 		if ((pfd[0].revents & (POLLIN|POLLHUP))) {
> 			if ((read_data = recvmsg(so, &msg, 0)) < 0)
> 				err(1, "recvmsg");
> 		}
> 		total_read_data += read_data;
> 		iov.iov_len -= read_data * sizeof(char);
> 		if (iov.iov_len <= 0) {
> 			printf("received all data: %d byte\n", total_read_data);
> 			break;
> 		}
> 	}
>
> 	if ((msg.msg_flags & MSG_CTRUNC))
> 		errx(1, "control message truncated");
>
> 	if ((msg.msg_flags & MSG_TRUNC))
> 		errx(1, "message truncated");
>
> 	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
> 	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
> 		if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
> 		    cmsg->cmsg_level == SOL_SOCKET &&
> 		    cmsg->cmsg_type == SCM_RIGHTS) {
> 			fd = *(int *)CMSG_DATA(cmsg);
> 		}
> 	}
>
> 	return fd;
> }
>
> Without the fix the following happens: MCLGETI will choose a cluster
> that is too large for the receive buffer of the second domain socket
> and will then also move too much data. Then it goes down the pr_usrreq
> path, in the case of a unix socket uipc_usrreq. There it will use
> sbappendcontrol to move the data into the socket buffer, which is not
> possible, so it just returns zero. In that case the sender thinks it
> sent all the data, but the receiver gets nothing, or another part that
> was small enough later on. There is also no error signaling there, so
> what happens is that the mbufs just leak. Maybe this should be handled
> in a different way, but I did not really think about that so far.
>
> BR
>
> Simon
>
>
> 2016-07-01 2:45 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
>>
>>> On 1 Jul 2016, at 04:44, Simon Mages <mages.si...@googlemail.com> wrote:
>>>
>>> Do you remember what the problem was that you encountered with ospfd
>>> and your kern_socket diff?
>>
>> the unix socket pair between two of the processes closed unexpectedly
>> or had the wrong amount of data available. it was too long ago for me
>> to recall correctly :(
>>
>>>
>>> I'm not a user of ospf :)
>>
>> that's fine. i'd hope more of the multiprocess daemons like ntpd and
>> smtpd would exhibit the behaviour too.
>>
>> do you have a way of testing the performance of sockets?
>>
>> dlg
>>
>>>
>>> 2016-06-22 14:54 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
>>>> On Wed, Jun 22, 2016 at 01:58:25PM +0200, Simon Mages wrote:
>>>>> On a system where you use the maximum socket buffer size of
>>>>> 256kbyte you can run out of memory after less than 9k open sockets.
>>>>>
>>>>> My patch adds a new uvm_constraint for the mbufs with a bigger
>>>>> memory area. I chose this area after reading the comments in
>>>>> sys/arch/amd64/include/pmap.h. This patch further changes the
>>>>> maximum socket buffer size from 256k to 1GB, as described in
>>>>> RFC 1323 section 2.3.
>>>>>
>>>>> I tested this diff with the ix, em and urndis driver. I know that
>>>>> this diff only works for amd64 right now, but I wanted to send it
>>>>> as a proposal of what could be done. Maybe somebody has a different
>>>>> solution for this problem or can tell me why this is a bad idea.
>>>>
>>>> hey simon,
>>>>
>>>> first, some background.
>>>>
>>>> the 4G watermark is less about limiting the amount of memory used
>>>> by the network stack and more about making the memory addressable
>>>> by as many devices, including network cards, as possible. we support
>>>> older chips that only deal with 32 bit addresses (and one or two
>>>> stupid ones with an inability to address over 1G), so we took the
>>>> conservative option and made the memory generally usable without
>>>> developers having to think about it much.
>>>>
>>>> you could argue that you should be able to give big addresses to
>>>> modern cards, but that falls down if you are forwarding packets
>>>> between a modern and an old card, cos the old card will want to dma
>>>> the packet the modern card rxed, but it needs it below the 4g line.
>>>> even if you don't have an old card, in today's hotplug world you
>>>> might plug an old device in. either way, the future of an mbuf is
>>>> very hard for the kernel to predict.
>>>>
>>>> secondly, allocating more than 4g at a time to socket buffers is
>>>> generally a waste of memory.
>>>> in practice you should scale the amount of memory available to
>>>> sockets according to the size of the tcp windows you need to
>>>> saturate the bandwidth available to the box. this means if you want
>>>> to sustain a gigabit of traffic with a 300ms round trip time for
>>>> packets, you'd "only" need ~37.5 megabytes of buffers. to sustain
>>>> 40 gigabit you'd need 1.5 gigabytes, which is still below 4G.
>>>> allowing more use of memory for buffers would likely induce latency.
>>>>
>>>> the above means that if you want to sustain a single 40G tcp
>>>> connection to that host you'd need to be able to place 1.5G on the
>>>> socket buffer, which is above the 1G you mention above. however, if
>>>> you want to sustain 2 connections, you ideally want to fairly share
>>>> the 1.5G between both sockets. they should get 750M each.
>>>>
>>>> fairly sharing buffers between the sockets may already be in place
>>>> in openbsd. when i reworked the pools subsystem i set it up so
>>>> things sleeping on memory were woken up in order.
>>>>
>>>> it occurs to me that perhaps we should limit mbufs by the bytes
>>>> they can use rather than the number of them. that would also work
>>>> well if we moved to per cpu caches for mbufs and clusters, cos the
>>>> number of active mbufs in the system becomes hard to limit
>>>> accurately if we want cpus to run independently.
>>>>
>>>> if you want something to work on in this area, could you look at
>>>> letting sockets use the "jumbo" clusters instead of assuming
>>>> everything has to be in 2k clusters? i started on this with the
>>>> diff below, but it broke ospfd and i never got back to it.
>>>>
>>>> if you get it working, it would be interesting to test creating
>>>> even bigger cluster pools, eg, a 1M or 4M mbuf cluster.
>>>>
>>>> cheers,
>>>> dlg
>>>>
>>>> Index: uipc_socket.c
>>>> ===================================================================
>>>> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
>>>> retrieving revision 1.135
>>>> diff -u -p -r1.135 uipc_socket.c
>>>> --- uipc_socket.c	11 Dec 2014 19:21:57 -0000	1.135
>>>> +++ uipc_socket.c	22 Dec 2014 01:11:03 -0000
>>>> @@ -493,15 +493,18 @@ restart:
>>>>  			mlen = MLEN;
>>>>  		}
>>>>  		if (resid >= MINCLSIZE && space >= MCLBYTES) {
>>>> -			MCLGET(m, M_NOWAIT);
>>>> +			MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
>>>> +			    lmin(space, MAXMCLBYTES)));
>>>>  			if ((m->m_flags & M_EXT) == 0)
>>>>  				goto nopages;
>>>>  			if (atomic && top == 0) {
>>>> -				len = lmin(MCLBYTES - max_hdr,
>>>> -				    resid);
>>>> +				len = lmin(resid,
>>>> +				    m->m_ext.ext_size -
>>>> +				    max_hdr);
>>>>  				m->m_data += max_hdr;
>>>>  			} else
>>>> -				len = lmin(MCLBYTES, resid);
>>>> +				len = lmin(resid,
>>>> +				    m->m_ext.ext_size);
>>>>  			space -= len;
>>>>  		} else {
>>>>  nopages:
>>>>
>>
>>
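
As an aside, the buffer figures quoted above are just the
bandwidth-delay product: window = bandwidth * round-trip time. A
trivial sketch of the arithmetic, assuming the 300ms RTT and decimal
units used in the mail:

#include <stdio.h>

int
main(void)
{
	const double rtt = 0.300;		/* seconds */
	const double gbits[] = { 1.0, 40.0 };	/* link speeds */
	int i;

	for (i = 0; i < 2; i++) {
		/* bits per second * seconds / 8 = bytes of window */
		double bytes = gbits[i] * 1e9 * rtt / 8.0;
		printf("%4.0f Gbit/s x %.0f ms RTT = %.1f MB of buffer\n",
		    gbits[i], rtt * 1000.0, bytes / 1e6);
	}
	return (0);
}

It prints ~37.5 MB for 1 Gbit/s and 1500 MB (1.5 GB) for 40 Gbit/s,
matching the numbers above and showing why a single 40G connection
would already exceed the proposed 1GB socket buffer limit.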