Hi,
I have a setup similar to the one of the original reporter.
My NodeName is localhost .
The error messages at boot time scared me, so I dug into the issue.
I also related this issue to my observation that slurm fails to launch jobs
when my standalone computer is disconnected (the router provided by my ISP
is very unstable). I could reproduce the issue with a simple C program
that mimics the get_addr_info function. After some trials, it appears that the issue
disappears when the hints.ai_flags do not include the AI_ADDRCONFIG flag
(see getaddrinfo(3) for more information). So the current workaround
patch `retry-getaddrinfo` only fixes the issue partially.
The following patch neutralizes the setup of the AI_ADDRCONFIG flag:
============================8><--------------------------------------------------------
--- a/src/common/conmgr.c
+++ b/src/common/conmgr.c
@@ -1807,7 +1807,7 @@
struct addrinfo hints = { .ai_family = AF_UNSPEC,
.ai_socktype = SOCK_STREAM,
.ai_protocol = 0,
- .ai_flags = AI_PASSIVE | AI_ADDRCONFIG };
+ .ai_flags = AI_PASSIVE /*| AI_ADDRCONFIG */ };
struct addrinfo *addrlist = NULL;
parsed_host_port_t *parsed_hp;
--- a/src/common/util-net.c
+++ b/src/common/util-net.c
@@ -261,7 +261,7 @@
else
hints.ai_family = AF_UNSPEC;
- hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV | AI_PASSIVE;
+ hints.ai_flags = /* AI_ADDRCONFIG | */ AI_NUMERICSERV | AI_PASSIVE;
if (hostname)
hints.ai_flags |= AI_CANONNAME;
hints.ai_socktype = SOCK_STREAM;
----------------------------><8========================================================
I guess that this patch is too brutal and that it must be refined.
In particular, the AI_ADDRCONFIG flag should perhaps be left unset only on
standalone computers.
However, I am not familiar enough with slurm and networking to go further.
Here is the simple C program that helped me to better isolate the issue:
============================8><--------------------------------------------------------
// `example-getaddrinfo-00.c' C source file
// gcc -Wall -o example-getaddrinfo-00 example-getaddrinfo-00.c
// $ ./example-getaddrinfo-00
// $ ./example-getaddrinfo-00 localhost
// $ ./example-getaddrinfo-00 debian.org
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <arpa/inet.h>
/*
 * Resolve a node name with getaddrinfo(3) and print each IPv4 address found.
 *
 * Usage: example-getaddrinfo-00 [NODENAME]
 *   NODENAME defaults to "localhost"; the literal string "NULL" makes the
 *   program pass a NULL node pointer to getaddrinfo().
 *
 * The lookup is restricted to AF_INET / SOCK_STREAM on service "6817" with
 * AI_NUMERICSERV | AI_PASSIVE | AI_CANONNAME. Change the `#if 0` below to
 * `#if 1` to add AI_ADDRCONFIG and compare the behaviour.
 *
 * Output: one line ">canonname< >address<" per result on stdout.
 * Returns EXIT_SUCCESS when the lookup succeeded, EXIT_FAILURE otherwise
 * (the gai_strerror() text is written to stderr).
 */
int main(int nargs, char *args[]) {
	char nodename[1024] = "localhost";
	const char serv[6] = "6817";
	struct addrinfo hints;
	struct addrinfo *result = NULL;
	char sa_str[INET6_ADDRSTRLEN];
	const char *xnodename = NULL;
	const char *canonname = NULL;
	int status = 0;

	if (1 < nargs) {
		snprintf(nodename, sizeof(nodename), "%s", args[1]);
	}
	/* "NULL" on the command line requests a NULL node argument. */
	if (strcmp(nodename, "NULL") != 0) {
		xnodename = nodename;
	}

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_INET;
	hints.ai_flags = AI_NUMERICSERV | AI_PASSIVE | AI_CANONNAME;
#if 0
	hints.ai_flags |= AI_ADDRCONFIG;
#endif
	hints.ai_socktype = SOCK_STREAM;

	status = getaddrinfo(xnodename, serv, &hints, &result);
	if (status != 0) {
		fprintf(stderr, "FAIL:getaddrinfo: ``%s''\n", gai_strerror(status));
		/*
		 * Nothing was allocated on failure; do not hand a NULL list to
		 * freeaddrinfo(), and do not leak a raw EAI_* value (which may
		 * be negative) into the process exit status.
		 */
		return EXIT_FAILURE;
	}

	/*
	 * The canonical name is read from the head of the list for every
	 * line, as before; guard against it being absent so that "%s" never
	 * receives a NULL pointer.
	 */
	canonname = (result != NULL && result->ai_canonname != NULL)
		? result->ai_canonname : "(null)";

	for (const struct addrinfo *rdx = result; rdx != NULL; rdx = rdx->ai_next) {
		const struct sockaddr_in *ai_addr_v4 =
			(const struct sockaddr_in *)(rdx->ai_addr);

		if (inet_ntop(AF_INET, &(ai_addr_v4->sin_addr),
			      sa_str, sizeof(sa_str)) == NULL) {
			/* Keep sa_str well defined if the conversion fails. */
			snprintf(sa_str, sizeof(sa_str), "?");
		}
		fprintf(stdout, ">%s< >%s<\n", canonname, sa_str);
	}

	freeaddrinfo(result);
	result = NULL;
	return EXIT_SUCCESS;
}
----------------------------><8========================================================
hth,
Jerome
--
Jerome BENOIT | calculus+at-rezozer^dot*net
https://qa.debian.org/developer.php?login=calcu...@rezozer.net
AE28 AE15 710D FF1D 87E5 A762 3F92 19A6 7F36 C68B