> From: David Gwynne <da...@gwynne.id.au>
> Date: Fri, 12 Aug 2016 16:38:45 +1000
>
> > On 1 Aug 2016, at 21:07, Simon Mages <mages.si...@googlemail.com> wrote:
> >
> > I sent this message to dlg@ directly to discuss my modification of his
> > diff to make the bigger mbuf clusters work. i got no response so far,
> > thats why i decided to post it on tech@ directly. Maybe this way i get
> > some feedback faster :)
>
> hey simon,
>
> i was travelling when you sent your mail to me and then it fell out
> of my head. sorry about that.
>
> if this is working correctly then i would like to put it in the tree. from
> the light testing i have done, it is working correctly. would anyone object?
>
> some performance measurement would also be interesting :)
Hmm, during debugging I've relied on the fact that only drivers allocate
the larger mbuf clusters for their rx rings. Anyway, shouldn't the diff be
using ulmin()?

> dlg
>
> >
> > BR
> >
> > Simon
> >
> > ### Original Mail:
> >
> > ---------- Forwarded message ----------
> > From: Simon Mages <mages.si...@googlemail.com>
> > Date: Fri, 22 Jul 2016 13:24:24 +0200
> > Subject: Re: [PATCH] let the mbufs use more then 4gb of memory
> > To: David Gwynne <da...@gwynne.id.au>
> >
> > Hi,
> >
> > I think i found the problem with your diff regarding the bigger mbuf
> > clusters.
> >
> > You choose a buffer size based on space and resid, but what happens when
> > resid is larger than space and space is, for example, 2050? The cluster
> > chosen then has the size 4096. But this size is too large for the socket
> > buffer. In the past this was never a problem because you only allocated
> > external clusters of size MCLBYTES, and only when space was larger than
> > MCLBYTES.
> >
> > diff:
> > Index: kern/uipc_socket.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> > retrieving revision 1.152
> > diff -u -p -u -p -r1.152 uipc_socket.c
> > --- kern/uipc_socket.c	13 Jun 2016 21:24:43 -0000	1.152
> > +++ kern/uipc_socket.c	22 Jul 2016 10:56:02 -0000
> > @@ -496,15 +496,18 @@ restart:
> >  				mlen = MLEN;
> >  			}
> >  			if (resid >= MINCLSIZE && space >= MCLBYTES) {
> > -				MCLGET(m, M_NOWAIT);
> > +				MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
> > +				    lmin(space, MAXMCLBYTES)));
> >  				if ((m->m_flags & M_EXT) == 0)
> >  					goto nopages;
> >  				if (atomic && top == 0) {
> > -					len = ulmin(MCLBYTES - max_hdr,
> > -					    resid);
> > +					len = lmin(lmin(resid, space),
> > +					    m->m_ext.ext_size -
> > +					    max_hdr);
> >  					m->m_data += max_hdr;
> >  				} else
> > -					len = ulmin(MCLBYTES, resid);
> > +					len = lmin(lmin(resid, space),
> > +					    m->m_ext.ext_size);
> >  				space -= len;
> >  			} else {
> >  nopages:
> >
> > Im using this diff now for a while on my notebook and everything works as
> > expected. But i had no time to really test it or test the performance.
> > This will be my next step.
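
To make the clamping concrete: a minimal standalone sketch of the length
calculation above, with made-up example values (space as sbspace() would
report it, resid as what is left to send, ext_size as the cluster size
MCLGETI picked); lmin() is a kernel helper, so a local equivalent stands in
for it here.

/*
 * Standalone illustration of the clamping discussed above; not kernel
 * code. The numbers are made-up examples: space is the free room in the
 * receiving socket buffer, resid is what the sender still wants to copy,
 * ext_size is the cluster size chosen from resid/space.
 */
#include <stdio.h>

static long
local_lmin(long a, long b)	/* stand-in for the kernel's lmin() */
{
	return (a < b ? a : b);
}

int
main(void)
{
	long space = 2050;	/* free space in the socket buffer */
	long resid = 8192;	/* bytes the sender still has to write */
	long ext_size = 4096;	/* cluster size picked for this mbuf */

	/* original jumbo-cluster diff: clamp against the cluster size only */
	long len_old = local_lmin(resid, ext_size);

	/* fixed version: clamp against space as well */
	long len_new = local_lmin(local_lmin(resid, space), ext_size);

	printf("old len = %ld (overshoots space = %ld)\n", len_old, space);
	printf("new len = %ld (fits the socket buffer)\n", len_new);
	return 0;
}

With these values the old calculation yields 4096, which is more than the
2050 bytes of space, while the fixed one yields 2050.
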
> > I reproduced the unix socket problem you mentioned with the following
> > little program:
> >
> > #include <stdlib.h>
> > #include <stdio.h>
> > #include <string.h>
> > #include <err.h>
> > #include <fcntl.h>
> > #include <poll.h>
> > #include <unistd.h>
> >
> > #include <sys/socket.h>
> > #include <sys/stat.h>
> > #include <sys/wait.h>
> >
> > #define FILE "/tmp/afile"
> >
> > int senddesc(int fd, int so);
> > int recvdesc(int so);
> >
> > int
> > main(void)
> > {
> > 	struct stat sb;
> > 	int sockpair[2];
> > 	pid_t pid = 0;
> > 	int status;
> > 	int newfile;
> >
> > 	if (unlink(FILE) < 0)
> > 		warn("unlink: %s", FILE);
> >
> > 	int file = open(FILE, O_RDWR|O_CREAT|O_TRUNC, 0600);
> >
> > 	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_NONBLOCK, 0, sockpair) < 0)
> > 		err(1, "socketpair");
> >
> > 	if ((pid = fork())) {
> > 		senddesc(file, sockpair[0]);
> > 		if (waitpid(pid, &status, 0) < 0)
> > 			err(1, "waitpid");
> > 	} else {
> > 		newfile = recvdesc(sockpair[1]);
> > 		if (fstat(newfile, &sb) < 0)
> > 			err(1, "fstat");
> > 	}
> >
> > 	return 0;
> > }
> >
> > int
> > senddesc(int fd, int so)
> > {
> > 	struct msghdr msg;
> > 	struct cmsghdr *cmsg;
> > 	union {
> > 		struct cmsghdr hdr;
> > 		unsigned char buf[CMSG_SPACE(sizeof(int))];
> > 	} cmsgbuf;
> >
> > 	char *cbuf = calloc(6392, sizeof(char));
> > 	memset(cbuf, 'K', 6392);
> > 	struct iovec iov = {
> > 		.iov_base = cbuf,
> > 		.iov_len = 6392,
> > 	};
> >
> > 	memset(&msg, 0, sizeof(struct msghdr));
> > 	msg.msg_iov = &iov;
> > 	msg.msg_iovlen = 1;
> > 	msg.msg_control = &cmsgbuf.buf;
> > 	msg.msg_controllen = sizeof(cmsgbuf.buf);
> >
> > 	cmsg = CMSG_FIRSTHDR(&msg);
> > 	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> > 	cmsg->cmsg_level = SOL_SOCKET;
> > 	cmsg->cmsg_type = SCM_RIGHTS;
> > 	*(int *)CMSG_DATA(cmsg) = fd;
> >
> > 	struct pollfd pfd[1];
> > 	int nready;
> > 	int wrote = 0;
> > 	int wrote_total = 0;
> > 	pfd[0].fd = so;
> > 	pfd[0].events = POLLOUT;
> >
> > 	while (1) {
> > 		nready = poll(pfd, 1, -1);
> > 		if (nready == -1)
> > 			err(1, "poll");
> > 		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
> > 			errx(1, "bad fd %d", pfd[0].fd);
> > 		if ((pfd[0].revents & (POLLOUT|POLLHUP))) {
> > 			if ((wrote = sendmsg(so, &msg, 0)) < 0)
> > 				err(1, "sendmsg");
> > 		}
> > 		wrote_total += wrote;
> > 		iov.iov_len -= wrote * sizeof(char);
> > 		if (iov.iov_len <= 0) {
> > 			printf("send all data: %d byte\n", wrote_total);
> > 			break;
> > 		}
> > 	}
> > 	return 0;
> > }
> >
> > int
> > recvdesc(int so)
> > {
> > 	int fd = -1;
> >
> > 	struct msghdr msg;
> > 	struct cmsghdr *cmsg;
> > 	union {
> > 		struct cmsghdr hdr;
> > 		unsigned char buf[CMSG_SPACE(sizeof(int))];
> > 	} cmsgbuf;
> > 	struct iovec iov;
> > 	iov.iov_base = calloc(6392, sizeof(char));
> > 	iov.iov_len = 6392 * sizeof(char);
> >
> > 	memset(&msg, 0, sizeof(struct msghdr));
> > 	msg.msg_control = &cmsgbuf.buf;
> > 	msg.msg_controllen = sizeof(cmsgbuf.buf);
> > 	msg.msg_iov = &iov;
> > 	msg.msg_iovlen = 1;
> >
> > 	struct pollfd pfd[1];
> > 	int nready;
> > 	int read_data = 0;
> > 	int total_read_data = 0;
> > 	pfd[0].fd = so;
> > 	pfd[0].events = POLLIN;
> >
> > 	while (1) {
> > 		nready = poll(pfd, 1, -1);
> > 		if (nready == -1)
> > 			err(1, "poll");
> > 		if ((pfd[0].revents & (POLLERR|POLLNVAL)))
> > 			errx(1, "bad fd %d", pfd[0].fd);
> > 		if ((pfd[0].revents & (POLLIN|POLLHUP))) {
> > 			if ((read_data = recvmsg(so, &msg, 0)) < 0)
> > 				err(1, "recvmsg");
> > 		}
> > 		total_read_data += read_data;
> > 		iov.iov_len -= read_data * sizeof(char);
> > 		if (iov.iov_len <= 0) {
> > 			printf("received all data: %d byte\n", total_read_data);
> > 			break;
> > 		}
> > 	}
> >
> > 	if ((msg.msg_flags & MSG_CTRUNC))
> > 		errx(1, "control message truncated");
> >
> > 	if ((msg.msg_flags & MSG_TRUNC))
> > 		errx(1, "message truncated");
> >
> > 	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
> > 	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
> > 		if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
> > 		    cmsg->cmsg_level == SOL_SOCKET &&
> > 		    cmsg->cmsg_type == SCM_RIGHTS) {
> > 			fd = *(int *)CMSG_DATA(cmsg);
> > 		}
> > 	}
> >
> > 	return fd;
> > }
> >
> > Without the fix the following happens. MCLGETI will choose a cluster
> > which is too large for the receive buffer of the second domain socket
> > and will then also move too much data. Then it goes down the pr_usrreq
> > path, in case of a unix socket uipc_usrreq. There it will then use
> > sbappendcontrol to move the data into the socket buffer, which is not
> > possible, so it just returns zero. In that case the sender thinks he
> > sent all the data but the receiver gets nothing, or another part which
> > was small enough later on. There is also no error signaling there, so
> > what happens is that the mbufs just leak. Maybe this should be handled
> > in a different way, but i did not really think about that so far.
> >
> > BR
> >
> > Simon
> >
> >
> > 2016-07-01 2:45 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
> >>
> >>> On 1 Jul 2016, at 04:44, Simon Mages <mages.si...@googlemail.com> wrote:
> >>>
> >>> Do you remember what the problem was you encountered with ospfd and
> >>> your kern_socket diff?
> >>
> >> the unix socket pair between two of the processes closed unexpectedly
> >> or had the wrong amount of data available. it was too long ago for me
> >> to recall correctly :(
> >>
> >>>
> >>> I'm not a user of ospf :)
> >>
> >> thats fine. id hope more of the multiprocess daemons like ntpd and smtpd
> >> would exhibit the behaviour too.
> >>
> >> do you have a way of testing performance of sockets?
> >>
> >> dlg
> >>
> >>>
> >>> 2016-06-22 14:54 GMT+02:00, David Gwynne <da...@gwynne.id.au>:
> >>>> On Wed, Jun 22, 2016 at 01:58:25PM +0200, Simon Mages wrote:
> >>>>> On a System where you use the maximum socketbuffer size of 256kbyte
> >>>>> you can run out of memory after less than 9k open sockets.
> >>>>>
> >>>>> My patch adds a new uvm_constraint for the mbufs with a bigger memory
> >>>>> area. I choose this area after reading the comments in
> >>>>> sys/arch/amd64/include/pmap.h.
> >>>>> This patch further changes the maximum socketbuffer size from 256k to
> >>>>> 1gb as it is described in rfc1323 S2.3.
> >>>>>
> >>>>> I tested this diff with the ix, em and urndis driver. I know that this
> >>>>> diff only works for amd64 right now, but i wanted to send this diff as
> >>>>> a proposal of what could be done. Maybe somebody has a different
> >>>>> solution for this problem or can tell me why this is a bad idea.
> >>>>
> >>>> hey simon,
> >>>>
> >>>> first, some background.
> >>>>
> >>>> the 4G watermark is less about limiting the amount of memory used
> >>>> by the network stack and more about making the memory addressable
> >>>> by as many devices, including network cards, as possible. we support
> >>>> older chips that only deal with 32 bit addresses (and one or two
> >>>> stupid ones with an inability to address over 1G), so we took the
> >>>> conservative option and made the memory generally usable without
> >>>> developers having to think about it much.
> >>>>
> >>>> you could argue that you should be able to give big addresses
> >>>> to modern cards, but that falls down if you are forwarding packets
> >>>> between a modern and an old card, cos the old card will want to dma
> >>>> the packet the modern card rxed, but it needs it below the 4g line.
> >>>> even if you dont have an old card, in todays hotplug world you might
> >>>> plug an old device in. either way, the future of an mbuf is very
> >>>> hard for the kernel to predict.
> >>>>
> >>>> secondly, allocating more than 4g at a time to socket buffers is
> >>>> generally a waste of memory. in practice you should scale the amount
> >>>> of memory available to sockets according to the size of the tcp
> >>>> windows you need to saturate the bandwidth available to the box.
> >>>> this means if you want to sustain a gigabit of traffic with a 300ms
> >>>> round trip time for packets, you'd "only" need ~37.5 megabytes of
> >>>> buffers. to sustain 40 gigabit you'd need 1.5 gigabytes, which is
> >>>> still below 4G. allowing more use of memory for buffers would likely
> >>>> induce latency.
> >>>>
> >>>> the above means that if you want to sustain a single 40G tcp
> >>>> connection to that host you'd need to be able to place 1.5G on the
> >>>> socket buffer, which is above the 1G you mention above. however,
> >>>> if you want to sustain 2 connections, you ideally want to fairly
> >>>> share the 1.5G between both sockets. they should get 750M each.
> >>>>
> >>>> fairly sharing buffers between the sockets may already be in place
> >>>> in openbsd. when i reworked the pools subsystem i set it up so
> >>>> things sleeping on memory were woken up in order.
> >>>>
> >>>> it occurs to me that perhaps we should limit mbufs by the bytes
> >>>> they can use rather than the number of them. that would also work
> >>>> well if we moved to per cpu caches for mbufs and clusters, cos the
> >>>> number of active mbufs in the system becomes hard to limit accurately
> >>>> if we want cpus to run independently.
> >>>>
> >>>> if you want something to work on in this area, could you look at
> >>>> letting sockets use the "jumbo" clusters instead of assuming
> >>>> everything has to be in 2k clusters? i started on this with the
> >>>> diff below, but it broke ospfd and i never got back to it.
> >>>>
> >>>> if you get it working, it would be interesting to test creating even
> >>>> bigger cluster pools, eg, a 1M or 4M mbuf cluster.
> >>>>
> >>>> cheers,
> >>>> dlg
> >>>>
> >>>> Index: uipc_socket.c
> >>>> ===================================================================
> >>>> RCS file: /cvs/src/sys/kern/uipc_socket.c,v
> >>>> retrieving revision 1.135
> >>>> diff -u -p -r1.135 uipc_socket.c
> >>>> --- uipc_socket.c	11 Dec 2014 19:21:57 -0000	1.135
> >>>> +++ uipc_socket.c	22 Dec 2014 01:11:03 -0000
> >>>> @@ -493,15 +493,18 @@ restart:
> >>>>  				mlen = MLEN;
> >>>>  			}
> >>>>  			if (resid >= MINCLSIZE && space >= MCLBYTES) {
> >>>> -				MCLGET(m, M_NOWAIT);
> >>>> +				MCLGETI(m, M_NOWAIT, NULL, lmin(resid,
> >>>> +				    lmin(space, MAXMCLBYTES)));
> >>>>  				if ((m->m_flags & M_EXT) == 0)
> >>>>  					goto nopages;
> >>>>  				if (atomic && top == 0) {
> >>>> -					len = lmin(MCLBYTES - max_hdr,
> >>>> -					    resid);
> >>>> +					len = lmin(resid,
> >>>> +					    m->m_ext.ext_size -
> >>>> +					    max_hdr);
> >>>>  					m->m_data += max_hdr;
> >>>>  				} else
> >>>> -					len = lmin(MCLBYTES, resid);
> >>>> +					len = lmin(resid,
> >>>> +					    m->m_ext.ext_size);
> >>>>  				space -= len;
> >>>>  			} else {
> >>>>  nopages:
> >>
> >
>
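
As a quick sanity check of the buffer sizing dlg describes above, the
figures follow from the bandwidth-delay product; a small standalone sketch
using the example values from his mail (1 Gbit/s and 40 Gbit/s at a 300ms
round trip time):

/*
 * Bandwidth-delay product for the figures quoted above:
 * bytes of buffer = bandwidth (bits/s) * RTT (s) / 8.
 * The link speeds and the 300ms RTT are just the example values
 * from dlg's mail.
 */
#include <stdio.h>

int
main(void)
{
	const double rtt = 0.3;			/* 300ms round trip time */
	const double gbits[] = { 1.0, 40.0 };	/* link speeds in Gbit/s */
	int i;

	for (i = 0; i < 2; i++) {
		double bytes = gbits[i] * 1e9 * rtt / 8;
		printf("%2.0f Gbit/s at %.0f ms RTT -> %.1f MB of buffer\n",
		    gbits[i], rtt * 1000, bytes / 1e6);
	}
	return 0;
}

This prints 37.5 MB for the gigabit case and 1500 MB (1.5 GB) for 40 Gbit,
matching the numbers in the mail; splitting that 1.5 GB fairly across two
such connections gives the roughly 750 MB each that dlg mentions.
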