Hi. We recently upgraded one of our appliances from Ubuntu 18.04 to 22.04, and
are now seeing intermittent issues with RDM messages.
(Kernel 5.15.0-75)
This issue was never seen on Ubuntu 18.04 and older (Ubuntu kernel 4.15.0)

I have a minimal set of test programs that reproduces the issue.

Basically, we have a client side that binds a socket to a semi-random TIPC
instance and sends a message to a server on a known address.
Part of the message is the client's instance address, so the server can send
back a reply.

Occasionally the server reply is lost.
We've tracked it down to cases where the publish message trails the
actual message on the server side. When that happens, the server's
TIPC stack apparently drops the message.

This ONLY happens when client and server are on two different pieces of 
hardware (fairly beefy supermicro server with 2 Xeon 5110 or Silver)

Here is a TCPDump of the sequence where it fails:
1             2023-06-20 16:23:09.761711     1.1.1     0.0.0     TIPC       74  
         Name Dist    Publication type:226 inst:19712
2             2023-06-20 16:23:09.761758     1.1.1     1.1.2     TIPC       105 
       Payld:Low    NamedMsg type:226 inst:2
3             2023-06-20 16:23:09.761770     1.1.2     1.1.1     TIPC       105 
       Payld:Low    NamedMsg type:226 inst:19712
4             2023-06-20 16:23:09.761906     1.1.1     0.0.0     TIPC       74  
         Name Dist    Withdrawal type:226 inst:19712

5             2023-06-20 16:23:09.761906     1.1.1     0.0.0     TIPC       74  
         Name Dist    Publication type:226 inst:19968
6             2023-06-20 16:23:09.761954     1.1.1     1.1.2     TIPC       105 
       Payld:Low    NamedMsg type:226 inst:2
7             2023-06-20 16:23:09.761965     1.1.2     1.1.1     TIPC       105 
       Payld:Low    NamedMsg type:226 inst:19968
8             2023-06-20 16:23:09.762054     1.1.1     0.0.0     TIPC       74  
         Name Dist    Withdrawal type:226 inst:19968

9             2023-06-20 16:23:09.762054     1.1.1     0.0.0     TIPC       74  
         Name Dist    Publication type:226 inst:20224
10           2023-06-20 16:23:09.762101     1.1.1     1.1.2     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:2
11           2023-06-20 16:23:09.762112     1.1.2     1.1.1     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:20224
12           2023-06-20 16:23:09.762250     1.1.1     0.0.0     TIPC       74   
        Name Dist    Withdrawal type:226 inst:20224

13           2023-06-20 16:23:09.762250     1.1.1     0.0.0     TIPC       74   
        Name Dist    Publication type:226 inst:20480
14           2023-06-20 16:23:09.762250     1.1.1     1.1.2     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:2
15           2023-06-20 16:23:09.762254     1.1.2     1.1.1     TIPC       58   
        Link State   State
16           2023-06-20 16:23:09.762267     1.1.2     1.1.1     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:20480
17           2023-06-20 16:23:09.762396     1.1.1     0.0.0     TIPC       74   
        Name Dist    Withdrawal type:226 inst:20480

18           2023-06-20 16:23:09.762396     1.1.1     0.0.0     TIPC       74   
        Name Dist    Publication type:226 inst:20736
19           2023-06-20 16:23:09.762397     1.1.1     1.1.2     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:2
20           2023-06-20 16:23:09.762410     1.1.2     1.1.1     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:20736

21           2023-06-20 16:23:09.762592     1.1.1     1.1.2     TIPC       105  
      Payld:Low    NamedMsg type:226 inst:2
22           2023-06-20 16:23:09.762656     1.1.1     0.0.0     TIPC       74   
        Name Dist    Withdrawal type:226 inst:20736
23           2023-06-20 16:23:09.762656     1.1.1     0.0.0     TIPC       74   
        Name Dist    Publication type:226 inst:20992

Packet 23 is the publish, while 21 is the payload.
I would have assumed (and I think the older tipc driver did so) that a datagram 
received from a source that is not yet published would update the nametable too?

The test programs usually reproduce the problem within 50-100 iterations.
Here is the client side that reproduces it:
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/tipc.h>
#include <string.h>
#include <unistd.h>

/*
 * TIPC RDM test client.
 *
 * Runs 1000 iterations. Each iteration:
 *   1. opens a fresh SOCK_RDM socket,
 *   2. binds it to name sequence {226, addr, addr}, addr = (i & 0xff) << 8,
 *   3. sends a 51-byte datagram whose first int is addr to name {226, 2},
 *   4. waits for one reply datagram,
 *   5. closes the socket.
 *
 * Returns EXIT_SUCCESS when every iteration completed, EXIT_FAILURE as soon
 * as a socket call fails (continuing after a failed bind or send would only
 * block forever in recvfrom(), since no reply can arrive).
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    int sendsize = 51;
    char buf[65535];
    memset(buf, 0, sendsize);
    int c = 0;

    /* Destination: the server's well-known name {226, 2}. */
    struct sockaddr_tipc to_addr = {0};
    to_addr.family = AF_TIPC;
    to_addr.scope = 0;
    to_addr.addrtype = TIPC_ADDR_NAME;
    to_addr.addr.name.name.type = 226;
    to_addr.addr.name.name.instance = 2;
    to_addr.addr.name.domain = 0;

    for (int i = 0; i < 1000; i++)
    {
        int addr = (i & 0xff) << 8;

        int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
        if (sock == -1)
        {
            perror("opening socket");
            return EXIT_FAILURE;
        }

        /* Bind to this iteration's instance so the server can reply by name. */
        struct sockaddr_tipc listen_addr = {0};
        listen_addr.family = AF_TIPC;
        listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
        listen_addr.addr.nameseq.type = 226;
        listen_addr.addr.nameseq.lower = addr;
        listen_addr.addr.nameseq.upper = addr;
        listen_addr.scope = TIPC_CLUSTER_SCOPE;

        if (-1 == bind(sock, (struct sockaddr*)&listen_addr, sizeof(listen_addr)))
        {
            perror("Error opening TIPC socket");
            close(sock);
            return EXIT_FAILURE;
        }

        /* The payload starts with our instance; the server echoes to it. */
        memcpy(buf, &addr, sizeof(addr));
        int rc = (int)sendto(sock, buf, sendsize, 0,
                             (struct sockaddr*)&to_addr, sizeof(to_addr));

        printf("tipc send rc = %d\n", rc);
        if (rc < 0)
        {
            perror("send err");
            close(sock);
            return EXIT_FAILURE;
        }

        /* Blocks until the reply arrives; this is where a lost reply hangs. */
        rc = (int)recvfrom(sock, buf, sizeof(buf), 0, NULL, NULL);
        if (rc < 0)
        {
            perror("recv err");
            close(sock);
            return EXIT_FAILURE;
        }

        c++;
        printf("Received %d\n", c);

        close(sock);
    }

    return EXIT_SUCCESS;
}

Here is the server side:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

/*
 * TIPC RDM test server.
 *
 * Binds a SOCK_RDM socket to name sequence {226, 2, 2} and then loops
 * forever: for each received datagram it reads the leading int as the
 * sender's instance and echoes the datagram back to name {226, instance}.
 *
 * Returns EXIT_FAILURE if the socket cannot be set up or recvfrom() fails;
 * otherwise it does not return.
 */
int main(int argc, char * argv[])
{
    (void)argc;
    (void)argv;

    int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
    if (sock == -1)
    {
        perror("opening socket");
        return EXIT_FAILURE;
    }

    struct sockaddr_tipc listen_addr = {0};
    listen_addr.family = AF_TIPC;
    listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
    listen_addr.addr.nameseq.type = 226;
    listen_addr.addr.nameseq.lower = 2;
    listen_addr.addr.nameseq.upper = 2;
    listen_addr.scope = TIPC_CLUSTER_SCOPE;

    if (-1 == bind(sock, (struct sockaddr*)&listen_addr, sizeof(listen_addr)))
    {
        perror("Error opening TIPC socket");
        return EXIT_FAILURE;
    }

    char buf[65535];

    int c = 0;
    for (;;)
    {
        int ret = (int)recvfrom(sock, buf, sizeof(buf), 0, NULL, NULL);
        c++;

        if (ret < 0)
        {
            /* Stop instead of spinning on a persistent socket error. */
            perror("recv err");
            break;
        }
        if (ret == 0)
            continue;

        printf("Received %d\n", c);

        /* The return instance is the first int of the payload. */
        int addr;
        if ((size_t)ret < sizeof(addr))
        {
            fprintf(stderr, "message too short (%d bytes), ignoring\n", ret);
            continue;
        }
        memcpy(&addr, buf, sizeof(addr));

        struct sockaddr_tipc to_addr = {0};
        to_addr.family = AF_TIPC;
        to_addr.scope = 0;
        to_addr.addrtype = TIPC_ADDR_NAME;
        to_addr.addr.name.name.type = 226;
        to_addr.addr.name.name.instance = addr;
        to_addr.addr.name.domain = 0;

        /* Echo the datagram back unchanged to the client's bound name. */
        if (sendto(sock, buf, ret, 0,
                   (struct sockaddr*)&to_addr, sizeof(to_addr)) < 0)
            perror("send err");
    }

    return EXIT_FAILURE;
}


_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

Reply via email to