On Thu, Mar 24, 2022 at 10:34:52AM +1000, Jonathan Matthew wrote:

> On Wed, Mar 23, 2022 at 04:59:06PM +0100, Otto Moerbeek wrote:
> > On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote:
> > 
> > > We noticed that the ntpd engine process was getting a bit big on some 
> > > boxes
> > > that we'd accidentally cut off from the ntp servers (routing is hard).
> > > Reading through the code, I noticed the 'query' member of struct ntp_peer
> > > is never freed, which seems to account for the leak.
> > > 
> > > If you have a server pool in ntpd.conf and it resolves, but ntpd is unable
> > > to talk to the servers, it will re-resolve periodically, freeing the old 
> > > list
> > > of peers and creating new ones.
> > > 
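> > > For reference, here's the pattern in question, trimmed down from
> > > client.c and ntp.c (the real functions do more than this; only the
> > > allocation/free pairing is shown):
> > > 
> > > 	/* client.c: each peer gets a separately allocated query */
> > > 	int
> > > 	client_peer_init(struct ntp_peer *p)
> > > 	{
> > > 		if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL)
> > > 			fatal("client_peer_init calloc");
> > > 		p->query->fd = -1;
> > > 		...
> > > 	}
> > > 
> > > 	/* ntp.c: peer_remove() frees the peer but never p->query, so
> > > 	 * every pool re-resolve leaks one struct ntp_query per peer */
> > > 	void
> > > 	peer_remove(struct ntp_peer *p)
> > > 	{
> > > 		TAILQ_REMOVE(&conf->ntp_peers, p, entry);
> > > 		free(p);
> > > 		peer_cnt--;
> > > 	}
> > > 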
> > > To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D
> > > after running for about two hours with four servers from two pools.
> > > 
> > > without diff:
> > >  
> > > Leak report
> > >                  f     sum      #    avg
> > >                0x0    9392    128     73
> > >      0x889878b920b     512      1    512
> > >      0x889878bc8e1    4096      4   1024
> > >      0x889878bd065     128      2     64
> > >      0x88bc91f0b4b   18280      1  18280
> > >      0x88bc926a9ed   65536      1  65536
> > >  
> > >  
> > > with diff:
> > >  
> > > Leak report
> > >                  f     sum      #    avg
> > >                0x0    6064     16    379
> > >      0xbee1253320b     512      1    512
> > >      0xbf0265f4b4b   18280      1  18280
> > >      0xbf02666e9ed   65536      1  65536
> > > 
> > > ok?
> > > 
> > > Index: ntp.c
> > > ===================================================================
> > > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> > > retrieving revision 1.168
> > > diff -u -p -r1.168 ntp.c
> > > --- ntp.c 24 Oct 2021 21:24:19 -0000      1.168
> > > +++ ntp.c 23 Mar 2022 10:43:59 -0000
> > > @@ -686,6 +686,7 @@ void
> > >  peer_remove(struct ntp_peer *p)
> > >  {
> > >   TAILQ_REMOVE(&conf->ntp_peers, p, entry);
> > > + free(p->query);
> > >   free(p);
> > >   peer_cnt--;
> > >  }
> > > 
> > 
> > This is a bug that dlg reported last week. Serendipity or not? :-)
> 
> We found it together looking at systems we run at work, so not really.

heh, dlg was talking about a colleague but I did not connect that to you,

anyway, thanks for the analysis,

        -Otto

> 
> > 
> > This is my diff that uses an approach I like a little bit better.
> 
> I agree.  I wasn't sure if there was a reason the query was allocated
> separately, so I went with the more straightforward diff to start with.
> 
> > 
> >     -Otto
> > 
> > Index: client.c
> > ===================================================================
> > RCS file: /cvs/src/usr.sbin/ntpd/client.c,v
> > retrieving revision 1.116
> > diff -u -p -r1.116 client.c
> > --- client.c        21 Apr 2021 09:38:11 -0000      1.116
> > +++ client.c        21 Mar 2022 07:31:54 -0000
> > @@ -51,10 +51,9 @@ set_deadline(struct ntp_peer *p, time_t 
> >  int
> >  client_peer_init(struct ntp_peer *p)
> >  {
> > -   if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL)
> > -           fatal("client_peer_init calloc");
> > -   p->query->fd = -1;
> > -   p->query->msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> > +   p->query.fd = -1;
> > +   p->query.msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> > +   p->query.xmttime = 0;
> >     p->state = STATE_NONE;
> >     p->shift = 0;
> >     p->trustlevel = TRUSTLEVEL_PATHETIC;
> > @@ -91,7 +90,7 @@ client_addr_init(struct ntp_peer *p)
> >             }
> >     }
> >  
> > -   p->query->fd = -1;
> > +   p->query.fd = -1;
> >     set_next(p, 0);
> >  
> >     return (0);
> > @@ -100,9 +99,9 @@ client_addr_init(struct ntp_peer *p)
> >  int
> >  client_nextaddr(struct ntp_peer *p)
> >  {
> > -   if (p->query->fd != -1) {
> > -           close(p->query->fd);
> > -           p->query->fd = -1;
> > +   if (p->query.fd != -1) {
> > +           close(p->query.fd);
> > +           p->query.fd = -1;
> >     }
> >  
> >     if (p->state == STATE_DNS_INPROGRESS)
> > @@ -148,26 +147,26 @@ client_query(struct ntp_peer *p)
> >     if (p->state < STATE_DNS_DONE || p->addr == NULL)
> >             return (-1);
> >  
> > -   if (p->query->fd == -1) {
> > +   if (p->query.fd == -1) {
> >             struct sockaddr *sa = (struct sockaddr *)&p->addr->ss;
> >             struct sockaddr *qa4 = (struct sockaddr *)&p->query_addr4;
> >             struct sockaddr *qa6 = (struct sockaddr *)&p->query_addr6;
> >  
> > -           if ((p->query->fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
> > +           if ((p->query.fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
> >                 0)) == -1)
> >                     fatal("client_query socket");
> >  
> >             if (p->addr->ss.ss_family == qa4->sa_family) {
> > -                   if (bind(p->query->fd, qa4, SA_LEN(qa4)) == -1)
> > +                   if (bind(p->query.fd, qa4, SA_LEN(qa4)) == -1)
> >                             fatal("couldn't bind to IPv4 query address: %s",
> >                                 log_sockaddr(qa4));
> >             } else if (p->addr->ss.ss_family == qa6->sa_family) {
> > -                   if (bind(p->query->fd, qa6, SA_LEN(qa6)) == -1)
> > +                   if (bind(p->query.fd, qa6, SA_LEN(qa6)) == -1)
> >                             fatal("couldn't bind to IPv6 query address: %s",
> >                                 log_sockaddr(qa6));
> >             }
> >  
> > -           if (connect(p->query->fd, sa, SA_LEN(sa)) == -1) {
> > +           if (connect(p->query.fd, sa, SA_LEN(sa)) == -1) {
> >                     if (errno == ECONNREFUSED || errno == ENETUNREACH ||
> >                         errno == EHOSTUNREACH || errno == EADDRNOTAVAIL) {
> >                             /* cycle through addresses, but do increase
> > @@ -183,11 +182,11 @@ client_query(struct ntp_peer *p)
> >                             fatal("client_query connect");
> >             }
> >             val = IPTOS_LOWDELAY;
> > -           if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query->fd,
> > +           if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query.fd,
> >                 IPPROTO_IP, IP_TOS, &val, sizeof(val)) == -1)
> >                     log_warn("setsockopt IPTOS_LOWDELAY");
> >             val = 1;
> > -           if (setsockopt(p->query->fd, SOL_SOCKET, SO_TIMESTAMP,
> > +           if (setsockopt(p->query.fd, SOL_SOCKET, SO_TIMESTAMP,
> >                 &val, sizeof(val)) == -1)
> >                     fatal("setsockopt SO_TIMESTAMP");
> >     }
> > @@ -206,11 +205,11 @@ client_query(struct ntp_peer *p)
> >      * Save the real transmit timestamp locally.
> >      */
> >  
> > -   p->query->msg.xmttime.int_partl = arc4random();
> > -   p->query->msg.xmttime.fractionl = arc4random();
> > -   p->query->xmttime = gettime();
> > +   p->query.msg.xmttime.int_partl = arc4random();
> > +   p->query.msg.xmttime.fractionl = arc4random();
> > +   p->query.xmttime = gettime();
> >  
> > -   if (ntp_sendmsg(p->query->fd, NULL, &p->query->msg) == -1) {
> > +   if (ntp_sendmsg(p->query.fd, NULL, &p->query.msg) == -1) {
> >             p->senderrors++;
> >             set_next(p, INTERVAL_QUERY_PATHETIC);
> >             p->trustlevel = TRUSTLEVEL_PATHETIC;
> > @@ -295,7 +294,7 @@ client_dispatch(struct ntp_peer *p, u_in
> >     somsg.msg_control = cmsgbuf.buf;
> >     somsg.msg_controllen = sizeof(cmsgbuf.buf);
> >  
> > -   if ((size = recvmsg(p->query->fd, &somsg, 0)) == -1) {
> > +   if ((size = recvmsg(p->query.fd, &somsg, 0)) == -1) {
> >             if (errno == EHOSTUNREACH || errno == EHOSTDOWN ||
> >                 errno == ENETUNREACH || errno == ENETDOWN ||
> >                 errno == ECONNREFUSED || errno == EADDRNOTAVAIL ||
> > @@ -333,8 +332,8 @@ client_dispatch(struct ntp_peer *p, u_in
> >  
> >     ntp_getmsg((struct sockaddr *)&p->addr->ss, buf, size, &msg);
> >  
> > -   if (msg.orgtime.int_partl != p->query->msg.xmttime.int_partl ||
> > -       msg.orgtime.fractionl != p->query->msg.xmttime.fractionl)
> > +   if (msg.orgtime.int_partl != p->query.msg.xmttime.int_partl ||
> > +       msg.orgtime.fractionl != p->query.msg.xmttime.fractionl)
> >             return (0);
> >  
> >     if ((msg.status & LI_ALARM) == LI_ALARM || msg.stratum == 0 ||
> > @@ -372,7 +371,7 @@ client_dispatch(struct ntp_peer *p, u_in
> >      *    d = (T4 - T1) - (T3 - T2)     t = ((T2 - T1) + (T3 - T4)) / 2.
> >      */
> >  
> > -   T1 = p->query->xmttime;
> > +   T1 = p->query.xmttime;
> >     T2 = lfp_to_d(msg.rectime);
> >     T3 = lfp_to_d(msg.xmttime);
> >  
> > Index: ntp.c
> > ===================================================================
> > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> > retrieving revision 1.168
> > diff -u -p -r1.168 ntp.c
> > --- ntp.c   24 Oct 2021 21:24:19 -0000      1.168
> > +++ ntp.c   21 Mar 2022 07:31:54 -0000
> > @@ -291,8 +291,8 @@ ntp_main(struct ntpd_conf *nconf, struct
> >                             nextaction = p->deadline;
> >  
> >                     if (p->state == STATE_QUERY_SENT &&
> > -                       p->query->fd != -1) {
> > -                           pfd[i].fd = p->query->fd;
> > +                       p->query.fd != -1) {
> > +                           pfd[i].fd = p->query.fd;
> >                             pfd[i].events = POLLIN;
> >                             idx2peer[i - idx_peers] = p;
> >                             i++;
> > Index: ntpd.h
> > ===================================================================
> > RCS file: /cvs/src/usr.sbin/ntpd/ntpd.h,v
> > retrieving revision 1.150
> > diff -u -p -r1.150 ntpd.h
> > --- ntpd.h  30 Aug 2020 16:21:29 -0000      1.150
> > +++ ntpd.h  21 Mar 2022 07:31:54 -0000
> > @@ -157,8 +157,8 @@ struct ntp_offset {
> >  struct ntp_peer {
> >     TAILQ_ENTRY(ntp_peer)            entry;
> >     struct ntp_addr_wrap             addr_head;
> > +   struct ntp_query                 query;
> >     struct ntp_addr                 *addr;
> > -   struct ntp_query                *query;
> >     struct ntp_offset                reply[OFFSET_ARRAY_SIZE];
> >     struct ntp_offset                update;
> >     struct sockaddr_in               query_addr4;
