On Wed, Mar 23, 2022 at 04:59:06PM +0100, Otto Moerbeek wrote:
> On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote:
> 
> > We noticed that the ntpd engine process was getting a bit big on some boxes
> > that we'd accidentally cut off from the ntp servers (routing is hard).
> > Reading through the code, I noticed the 'query' member of struct ntp_peer
> > is never freed, which seems to account for the leak.
> > 
> > If you have a server pool in ntpd.conf and it resolves, but ntpd is unable
> > to talk to the servers, it will re-resolve periodically, freeing the old 
> > list
> > of peers and creating new ones.
> > 
> > To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D
> > after running for about two hours with four servers from two pools.
> > 
> > without diff:
> >  
> > Leak report
> >                  f     sum      #    avg
> >                0x0    9392    128     73
> >      0x889878b920b     512      1    512
> >      0x889878bc8e1    4096      4   1024
> >      0x889878bd065     128      2     64
> >      0x88bc91f0b4b   18280      1  18280
> >      0x88bc926a9ed   65536      1  65536
> >  
> >  
> > with diff:
> >  
> > Leak report
> >                  f     sum      #    avg
> >                0x0    6064     16    379
> >      0xbee1253320b     512      1    512
> >      0xbf0265f4b4b   18280      1  18280
> >      0xbf02666e9ed   65536      1  65536
> > 
> > ok?
> > 
> > Index: ntp.c
> > ===================================================================
> > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> > retrieving revision 1.168
> > diff -u -p -r1.168 ntp.c
> > --- ntp.c   24 Oct 2021 21:24:19 -0000      1.168
> > +++ ntp.c   23 Mar 2022 10:43:59 -0000
> > @@ -686,6 +686,7 @@ void
> >  peer_remove(struct ntp_peer *p)
> >  {
> >     TAILQ_REMOVE(&conf->ntp_peers, p, entry);
> > +   free(p->query);
> >     free(p);
> >     peer_cnt--;
> >  }
> > 
> 
> This is a bug that dlg reported last week. Serendepity or not? :-)

We found it together looking at systems we run at work, so not really.

> 
> This is my diff that uses an approach I like a litle bit better.

I agree.  I wasn't sure if there was a reason the query was allocated
separately, so I went with the more straightforward diff to start with.

> 
>       -Otto
> 
> Index: client.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ntpd/client.c,v
> retrieving revision 1.116
> diff -u -p -r1.116 client.c
> --- client.c  21 Apr 2021 09:38:11 -0000      1.116
> +++ client.c  21 Mar 2022 07:31:54 -0000
> @@ -51,10 +51,9 @@ set_deadline(struct ntp_peer *p, time_t 
>  int
>  client_peer_init(struct ntp_peer *p)
>  {
> -     if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL)
> -             fatal("client_peer_init calloc");
> -     p->query->fd = -1;
> -     p->query->msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> +     p->query.fd = -1;
> +     p->query.msg.status = MODE_CLIENT | (NTP_VERSION << 3);
> +     p->query.xmttime = 0;
>       p->state = STATE_NONE;
>       p->shift = 0;
>       p->trustlevel = TRUSTLEVEL_PATHETIC;
> @@ -91,7 +90,7 @@ client_addr_init(struct ntp_peer *p)
>               }
>       }
>  
> -     p->query->fd = -1;
> +     p->query.fd = -1;
>       set_next(p, 0);
>  
>       return (0);
> @@ -100,9 +99,9 @@ client_addr_init(struct ntp_peer *p)
>  int
>  client_nextaddr(struct ntp_peer *p)
>  {
> -     if (p->query->fd != -1) {
> -             close(p->query->fd);
> -             p->query->fd = -1;
> +     if (p->query.fd != -1) {
> +             close(p->query.fd);
> +             p->query.fd = -1;
>       }
>  
>       if (p->state == STATE_DNS_INPROGRESS)
> @@ -148,26 +147,26 @@ client_query(struct ntp_peer *p)
>       if (p->state < STATE_DNS_DONE || p->addr == NULL)
>               return (-1);
>  
> -     if (p->query->fd == -1) {
> +     if (p->query.fd == -1) {
>               struct sockaddr *sa = (struct sockaddr *)&p->addr->ss;
>               struct sockaddr *qa4 = (struct sockaddr *)&p->query_addr4;
>               struct sockaddr *qa6 = (struct sockaddr *)&p->query_addr6;
>  
> -             if ((p->query->fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
> +             if ((p->query.fd = socket(p->addr->ss.ss_family, SOCK_DGRAM,
>                   0)) == -1)
>                       fatal("client_query socket");
>  
>               if (p->addr->ss.ss_family == qa4->sa_family) {
> -                     if (bind(p->query->fd, qa4, SA_LEN(qa4)) == -1)
> +                     if (bind(p->query.fd, qa4, SA_LEN(qa4)) == -1)
>                               fatal("couldn't bind to IPv4 query address: %s",
>                                   log_sockaddr(qa4));
>               } else if (p->addr->ss.ss_family == qa6->sa_family) {
> -                     if (bind(p->query->fd, qa6, SA_LEN(qa6)) == -1)
> +                     if (bind(p->query.fd, qa6, SA_LEN(qa6)) == -1)
>                               fatal("couldn't bind to IPv6 query address: %s",
>                                   log_sockaddr(qa6));
>               }
>  
> -             if (connect(p->query->fd, sa, SA_LEN(sa)) == -1) {
> +             if (connect(p->query.fd, sa, SA_LEN(sa)) == -1) {
>                       if (errno == ECONNREFUSED || errno == ENETUNREACH ||
>                           errno == EHOSTUNREACH || errno == EADDRNOTAVAIL) {
>                               /* cycle through addresses, but do increase
> @@ -183,11 +182,11 @@ client_query(struct ntp_peer *p)
>                               fatal("client_query connect");
>               }
>               val = IPTOS_LOWDELAY;
> -             if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query->fd,
> +             if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query.fd,
>                   IPPROTO_IP, IP_TOS, &val, sizeof(val)) == -1)
>                       log_warn("setsockopt IPTOS_LOWDELAY");
>               val = 1;
> -             if (setsockopt(p->query->fd, SOL_SOCKET, SO_TIMESTAMP,
> +             if (setsockopt(p->query.fd, SOL_SOCKET, SO_TIMESTAMP,
>                   &val, sizeof(val)) == -1)
>                       fatal("setsockopt SO_TIMESTAMP");
>       }
> @@ -206,11 +205,11 @@ client_query(struct ntp_peer *p)
>        * Save the real transmit timestamp locally.
>        */
>  
> -     p->query->msg.xmttime.int_partl = arc4random();
> -     p->query->msg.xmttime.fractionl = arc4random();
> -     p->query->xmttime = gettime();
> +     p->query.msg.xmttime.int_partl = arc4random();
> +     p->query.msg.xmttime.fractionl = arc4random();
> +     p->query.xmttime = gettime();
>  
> -     if (ntp_sendmsg(p->query->fd, NULL, &p->query->msg) == -1) {
> +     if (ntp_sendmsg(p->query.fd, NULL, &p->query.msg) == -1) {
>               p->senderrors++;
>               set_next(p, INTERVAL_QUERY_PATHETIC);
>               p->trustlevel = TRUSTLEVEL_PATHETIC;
> @@ -295,7 +294,7 @@ client_dispatch(struct ntp_peer *p, u_in
>       somsg.msg_control = cmsgbuf.buf;
>       somsg.msg_controllen = sizeof(cmsgbuf.buf);
>  
> -     if ((size = recvmsg(p->query->fd, &somsg, 0)) == -1) {
> +     if ((size = recvmsg(p->query.fd, &somsg, 0)) == -1) {
>               if (errno == EHOSTUNREACH || errno == EHOSTDOWN ||
>                   errno == ENETUNREACH || errno == ENETDOWN ||
>                   errno == ECONNREFUSED || errno == EADDRNOTAVAIL ||
> @@ -333,8 +332,8 @@ client_dispatch(struct ntp_peer *p, u_in
>  
>       ntp_getmsg((struct sockaddr *)&p->addr->ss, buf, size, &msg);
>  
> -     if (msg.orgtime.int_partl != p->query->msg.xmttime.int_partl ||
> -         msg.orgtime.fractionl != p->query->msg.xmttime.fractionl)
> +     if (msg.orgtime.int_partl != p->query.msg.xmttime.int_partl ||
> +         msg.orgtime.fractionl != p->query.msg.xmttime.fractionl)
>               return (0);
>  
>       if ((msg.status & LI_ALARM) == LI_ALARM || msg.stratum == 0 ||
> @@ -372,7 +371,7 @@ client_dispatch(struct ntp_peer *p, u_in
>        *    d = (T4 - T1) - (T3 - T2)     t = ((T2 - T1) + (T3 - T4)) / 2.
>        */
>  
> -     T1 = p->query->xmttime;
> +     T1 = p->query.xmttime;
>       T2 = lfp_to_d(msg.rectime);
>       T3 = lfp_to_d(msg.xmttime);
>  
> Index: ntp.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 ntp.c
> --- ntp.c     24 Oct 2021 21:24:19 -0000      1.168
> +++ ntp.c     21 Mar 2022 07:31:54 -0000
> @@ -291,8 +291,8 @@ ntp_main(struct ntpd_conf *nconf, struct
>                               nextaction = p->deadline;
>  
>                       if (p->state == STATE_QUERY_SENT &&
> -                         p->query->fd != -1) {
> -                             pfd[i].fd = p->query->fd;
> +                         p->query.fd != -1) {
> +                             pfd[i].fd = p->query.fd;
>                               pfd[i].events = POLLIN;
>                               idx2peer[i - idx_peers] = p;
>                               i++;
> Index: ntpd.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/ntpd/ntpd.h,v
> retrieving revision 1.150
> diff -u -p -r1.150 ntpd.h
> --- ntpd.h    30 Aug 2020 16:21:29 -0000      1.150
> +++ ntpd.h    21 Mar 2022 07:31:54 -0000
> @@ -157,8 +157,8 @@ struct ntp_offset {
>  struct ntp_peer {
>       TAILQ_ENTRY(ntp_peer)            entry;
>       struct ntp_addr_wrap             addr_head;
> +     struct ntp_query                 query;
>       struct ntp_addr                 *addr;
> -     struct ntp_query                *query;
>       struct ntp_offset                reply[OFFSET_ARRAY_SIZE];
>       struct ntp_offset                update;
>       struct sockaddr_in               query_addr4;

Reply via email to