Hi,

I have a setup similar to the one of the original reporter.
My NodeName is localhost.

The error messages at booting time scared me, so I dug into the issue.
I also related this issue to my observation that slurm fails to launch jobs
when my standalone computer is disconnected (the router provided by my ISP
is very unstable). I could reproduce the issue with a simple C program
that mimics the get_addr_info function. After some trials, it appears that the issue
disappears when the hints.ai_flags do not include the AI_ADDRCONFIG flag
(see getaddrinfo(3) for more information). So the current workaround
patch `retry-getaddrinfo` only fixes the issue partially.

The following patch neutralizes the setup of the AI_ADDRCONFIG flag:

============================8><--------------------------------------------------------
--- a/src/common/conmgr.c
+++ b/src/common/conmgr.c
@@ -1807,7 +1807,7 @@
        struct addrinfo hints = { .ai_family = AF_UNSPEC,
                                  .ai_socktype = SOCK_STREAM,
                                  .ai_protocol = 0,
-                                 .ai_flags = AI_PASSIVE | AI_ADDRCONFIG };
+                                 .ai_flags = AI_PASSIVE /*| AI_ADDRCONFIG */ };
        struct addrinfo *addrlist = NULL;
        parsed_host_port_t *parsed_hp;
--- a/src/common/util-net.c
+++ b/src/common/util-net.c
@@ -261,7 +261,7 @@
        else
                hints.ai_family = AF_UNSPEC;
- hints.ai_flags = AI_ADDRCONFIG | AI_NUMERICSERV | AI_PASSIVE;
+       hints.ai_flags = /* AI_ADDRCONFIG | */ AI_NUMERICSERV | AI_PASSIVE;
        if (hostname)
                hints.ai_flags |= AI_CANONNAME;
        hints.ai_socktype = SOCK_STREAM;
----------------------------><8========================================================

I guess that this patch is too brutal and that it must be refined.
In particular, the AI_ADDRCONFIG flag should perhaps be left unset only on
standalone computers.
However I am not familiar enough with slurm and network stuff to step further.

Here is the simple C program that helped me to better isolate the issue:

============================8><--------------------------------------------------------
// `example-getaddrinfo-00.c'  C source file

// gcc -Wall -o example-getaddrinfo-00 example-getaddrinfo-00.c
// $ ./example-getaddrinfo-00
// $ ./example-getaddrinfo-00 localhost
// $ ./example-getaddrinfo-00 debian.org

#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <arpa/inet.h>

/*
 * Resolve a node name with getaddrinfo(3) and print each IPv4 address found.
 *
 * Usage: the first command-line argument, if present, is the node name to
 * resolve (default "localhost"). The literal argument "NULL" makes the program
 * pass a NULL node pointer to getaddrinfo(). The service is fixed to "6817".
 *
 * Output: one line per returned address on stdout, formatted as
 * ">canonical-name< >dotted-quad<". On resolution failure a diagnostic built
 * from gai_strerror() is written to stderr.
 *
 * Returns the getaddrinfo() status code (0 on success).
 */
int main(int nargs, char *args[]) {
        char nodename[1024]="localhost";
        const char serv[]="6817";
        struct addrinfo hints;
        struct addrinfo *result=NULL;
        struct addrinfo *rdx=NULL;
        const struct sockaddr_in *ai_addr_v4=NULL;
        char sa_str[INET6_ADDRSTRLEN];
        const char *xnodename=NULL;
        const char *canonname=NULL;
        int status=0;

        if (1<nargs) {
                snprintf(nodename,sizeof(nodename),"%s",args[1]);
        }
        /* The string "NULL" is a sentinel: keep xnodename as a NULL pointer. */
        if (strcmp(nodename,"NULL")!=0) {
                xnodename=nodename;
        }

        memset(&hints,0,sizeof(hints));
        hints.ai_family=AF_INET;
        hints.ai_flags=AI_NUMERICSERV | AI_PASSIVE | AI_CANONNAME;
        /* Switch to `#if 1' to add AI_ADDRCONFIG, the flag under investigation. */
#if 0
        hints.ai_flags |= AI_ADDRCONFIG;
#endif
        hints.ai_socktype=SOCK_STREAM;

        status=getaddrinfo(xnodename,serv,&hints,&result);
        if (status) {
                fprintf(stderr,"FAIL:getaddrinfo: ``%s''\n",gai_strerror(status));
        }

        for (rdx=result;rdx!=NULL;rdx=rdx->ai_next) {
                /* hints.ai_family is AF_INET, so entries carry sockaddr_in. */
                ai_addr_v4=(const struct sockaddr_in *)(rdx->ai_addr);
                if (inet_ntop(AF_INET,&(ai_addr_v4->sin_addr),sa_str,sizeof(sa_str))==NULL) {
                        perror("FAIL:inet_ntop");
                        continue;
                }
                /* The canonical name is read from the head of the list for every
                 * line; avoid handing a NULL pointer to %s. */
                canonname=(result->ai_canonname!=NULL)?result->ai_canonname:"(null)";
                fprintf(stdout,">%s< >%s<\n",canonname,sa_str);
        }

        /* Only release a list that getaddrinfo() actually produced. */
        if (result!=NULL) {
                freeaddrinfo(result);
                result=NULL;
        }

        return status;
}
----------------------------><8========================================================

hth,
Jerome
--
Jerome BENOIT | calculus+at-rezozer^dot*net
https://qa.debian.org/developer.php?login=calcu...@rezozer.net
AE28 AE15 710D FF1D 87E5  A762 3F92 19A6 7F36 C68B

Reply via email to