On Wed, Mar 23, 2022 at 04:59:06PM +0100, Otto Moerbeek wrote: > On Wed, Mar 23, 2022 at 09:09:01PM +1000, Jonathan Matthew wrote: > > > We noticed that the ntpd engine process was getting a bit big on some boxes > > that we'd accidentally cut off from the ntp servers (routing is hard). > > Reading through the code, I noticed the 'query' member of struct ntp_peer > > is never freed, which seems to account for the leak. > > > > If you have a server pool in ntpd.conf and it resolves, but ntpd is unable > > to talk to the servers, it will re-resolve periodically, freeing the old > > list > > of peers and creating new ones. > > > > To show how slow the leak is, here's the leak report from MALLOC_OPTIONS=D > > after running for about two hours with four servers from two pools. > > > > without diff: > > > > Leak report > > f sum # avg > > 0x0 9392 128 73 > > 0x889878b920b 512 1 512 > > 0x889878bc8e1 4096 4 1024 > > 0x889878bd065 128 2 64 > > 0x88bc91f0b4b 18280 1 18280 > > 0x88bc926a9ed 65536 1 65536 > > > > > > with diff: > > > > Leak report > > f sum # avg > > 0x0 6064 16 379 > > 0xbee1253320b 512 1 512 > > 0xbf0265f4b4b 18280 1 18280 > > 0xbf02666e9ed 65536 1 65536 > > > > ok? > > > > Index: ntp.c > > =================================================================== > > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v > > retrieving revision 1.168 > > diff -u -p -r1.168 ntp.c > > --- ntp.c 24 Oct 2021 21:24:19 -0000 1.168 > > +++ ntp.c 23 Mar 2022 10:43:59 -0000 > > @@ -686,6 +686,7 @@ void > > peer_remove(struct ntp_peer *p) > > { > > TAILQ_REMOVE(&conf->ntp_peers, p, entry); > > + free(p->query); > > free(p); > > peer_cnt--; > > } > > > > This is a bug that dlg reported last week. Serendepity or not? :-)
We found it together looking at systems we run at work, so not really. > > This is my diff that uses an approach I like a litle bit better. I agree. I wasn't sure if there was a reason the query was allocated separately, so I went with the more straightforward diff to start with. > > -Otto > > Index: client.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ntpd/client.c,v > retrieving revision 1.116 > diff -u -p -r1.116 client.c > --- client.c 21 Apr 2021 09:38:11 -0000 1.116 > +++ client.c 21 Mar 2022 07:31:54 -0000 > @@ -51,10 +51,9 @@ set_deadline(struct ntp_peer *p, time_t > int > client_peer_init(struct ntp_peer *p) > { > - if ((p->query = calloc(1, sizeof(struct ntp_query))) == NULL) > - fatal("client_peer_init calloc"); > - p->query->fd = -1; > - p->query->msg.status = MODE_CLIENT | (NTP_VERSION << 3); > + p->query.fd = -1; > + p->query.msg.status = MODE_CLIENT | (NTP_VERSION << 3); > + p->query.xmttime = 0; > p->state = STATE_NONE; > p->shift = 0; > p->trustlevel = TRUSTLEVEL_PATHETIC; > @@ -91,7 +90,7 @@ client_addr_init(struct ntp_peer *p) > } > } > > - p->query->fd = -1; > + p->query.fd = -1; > set_next(p, 0); > > return (0); > @@ -100,9 +99,9 @@ client_addr_init(struct ntp_peer *p) > int > client_nextaddr(struct ntp_peer *p) > { > - if (p->query->fd != -1) { > - close(p->query->fd); > - p->query->fd = -1; > + if (p->query.fd != -1) { > + close(p->query.fd); > + p->query.fd = -1; > } > > if (p->state == STATE_DNS_INPROGRESS) > @@ -148,26 +147,26 @@ client_query(struct ntp_peer *p) > if (p->state < STATE_DNS_DONE || p->addr == NULL) > return (-1); > > - if (p->query->fd == -1) { > + if (p->query.fd == -1) { > struct sockaddr *sa = (struct sockaddr *)&p->addr->ss; > struct sockaddr *qa4 = (struct sockaddr *)&p->query_addr4; > struct sockaddr *qa6 = (struct sockaddr *)&p->query_addr6; > > - if ((p->query->fd = socket(p->addr->ss.ss_family, SOCK_DGRAM, > + if ((p->query.fd = socket(p->addr->ss.ss_family, SOCK_DGRAM, > 0)) == -1) > fatal("client_query socket"); > > if (p->addr->ss.ss_family == qa4->sa_family) { > - if (bind(p->query->fd, qa4, SA_LEN(qa4)) == -1) > + if (bind(p->query.fd, qa4, SA_LEN(qa4)) == -1) > fatal("couldn't bind to IPv4 query address: %s", > log_sockaddr(qa4)); > } else if (p->addr->ss.ss_family == qa6->sa_family) { > - if (bind(p->query->fd, qa6, SA_LEN(qa6)) == -1) > + if (bind(p->query.fd, qa6, SA_LEN(qa6)) == -1) > fatal("couldn't bind to IPv6 query address: %s", > log_sockaddr(qa6)); > } > > - if (connect(p->query->fd, sa, SA_LEN(sa)) == -1) { > + if (connect(p->query.fd, sa, SA_LEN(sa)) == -1) { > if (errno == ECONNREFUSED || errno == ENETUNREACH || > errno == EHOSTUNREACH || errno == EADDRNOTAVAIL) { > /* cycle through addresses, but do increase > @@ -183,11 +182,11 @@ client_query(struct ntp_peer *p) > fatal("client_query connect"); > } > val = IPTOS_LOWDELAY; > - if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query->fd, > + if (p->addr->ss.ss_family == AF_INET && setsockopt(p->query.fd, > IPPROTO_IP, IP_TOS, &val, sizeof(val)) == -1) > log_warn("setsockopt IPTOS_LOWDELAY"); > val = 1; > - if (setsockopt(p->query->fd, SOL_SOCKET, SO_TIMESTAMP, > + if (setsockopt(p->query.fd, SOL_SOCKET, SO_TIMESTAMP, > &val, sizeof(val)) == -1) > fatal("setsockopt SO_TIMESTAMP"); > } > @@ -206,11 +205,11 @@ client_query(struct ntp_peer *p) > * Save the real transmit timestamp locally. > */ > > - p->query->msg.xmttime.int_partl = arc4random(); > - p->query->msg.xmttime.fractionl = arc4random(); > - p->query->xmttime = gettime(); > + p->query.msg.xmttime.int_partl = arc4random(); > + p->query.msg.xmttime.fractionl = arc4random(); > + p->query.xmttime = gettime(); > > - if (ntp_sendmsg(p->query->fd, NULL, &p->query->msg) == -1) { > + if (ntp_sendmsg(p->query.fd, NULL, &p->query.msg) == -1) { > p->senderrors++; > set_next(p, INTERVAL_QUERY_PATHETIC); > p->trustlevel = TRUSTLEVEL_PATHETIC; > @@ -295,7 +294,7 @@ client_dispatch(struct ntp_peer *p, u_in > somsg.msg_control = cmsgbuf.buf; > somsg.msg_controllen = sizeof(cmsgbuf.buf); > > - if ((size = recvmsg(p->query->fd, &somsg, 0)) == -1) { > + if ((size = recvmsg(p->query.fd, &somsg, 0)) == -1) { > if (errno == EHOSTUNREACH || errno == EHOSTDOWN || > errno == ENETUNREACH || errno == ENETDOWN || > errno == ECONNREFUSED || errno == EADDRNOTAVAIL || > @@ -333,8 +332,8 @@ client_dispatch(struct ntp_peer *p, u_in > > ntp_getmsg((struct sockaddr *)&p->addr->ss, buf, size, &msg); > > - if (msg.orgtime.int_partl != p->query->msg.xmttime.int_partl || > - msg.orgtime.fractionl != p->query->msg.xmttime.fractionl) > + if (msg.orgtime.int_partl != p->query.msg.xmttime.int_partl || > + msg.orgtime.fractionl != p->query.msg.xmttime.fractionl) > return (0); > > if ((msg.status & LI_ALARM) == LI_ALARM || msg.stratum == 0 || > @@ -372,7 +371,7 @@ client_dispatch(struct ntp_peer *p, u_in > * d = (T4 - T1) - (T3 - T2) t = ((T2 - T1) + (T3 - T4)) / 2. > */ > > - T1 = p->query->xmttime; > + T1 = p->query.xmttime; > T2 = lfp_to_d(msg.rectime); > T3 = lfp_to_d(msg.xmttime); > > Index: ntp.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ntpd/ntp.c,v > retrieving revision 1.168 > diff -u -p -r1.168 ntp.c > --- ntp.c 24 Oct 2021 21:24:19 -0000 1.168 > +++ ntp.c 21 Mar 2022 07:31:54 -0000 > @@ -291,8 +291,8 @@ ntp_main(struct ntpd_conf *nconf, struct > nextaction = p->deadline; > > if (p->state == STATE_QUERY_SENT && > - p->query->fd != -1) { > - pfd[i].fd = p->query->fd; > + p->query.fd != -1) { > + pfd[i].fd = p->query.fd; > pfd[i].events = POLLIN; > idx2peer[i - idx_peers] = p; > i++; > Index: ntpd.h > =================================================================== > RCS file: /cvs/src/usr.sbin/ntpd/ntpd.h,v > retrieving revision 1.150 > diff -u -p -r1.150 ntpd.h > --- ntpd.h 30 Aug 2020 16:21:29 -0000 1.150 > +++ ntpd.h 21 Mar 2022 07:31:54 -0000 > @@ -157,8 +157,8 @@ struct ntp_offset { > struct ntp_peer { > TAILQ_ENTRY(ntp_peer) entry; > struct ntp_addr_wrap addr_head; > + struct ntp_query query; > struct ntp_addr *addr; > - struct ntp_query *query; > struct ntp_offset reply[OFFSET_ARRAY_SIZE]; > struct ntp_offset update; > struct sockaddr_in query_addr4;