Hi. We recently upgraded an appliance we have from Ubuntu 18.04 to 22.04, and
are now seeing intermittent issues with RDM messages.
(Kernel 5.15.0-75)
This issue was never seen on Ubuntu 18.04 and older (Ubuntu kernel 4.15.0)
I have a minimal set of test programs that reproduces the issue.
Basically we have a client side that opens a semi-random tipc-instance port
and sends a message to a server at a known address.
Part of the message is the client instance address, so the server can send back
a reply.
Occasionally the server reply is lost.
We've tracked it down to the publish message occasionally arriving after the
actual message on the server side. When that happens, the server's tipc stack
apparently drops the reply.
This ONLY happens when client and server are on two different pieces of
hardware (fairly beefy supermicro server with 2 Xeon 5110 or Silver)
Here is a TCPDump of the sequence where it fails:
1 2023-06-20 16:23:09.761711 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:19712
2 2023-06-20 16:23:09.761758 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
3 2023-06-20 16:23:09.761770 1.1.2 1.1.1 TIPC 105
Payld:Low NamedMsg type:226 inst:19712
4 2023-06-20 16:23:09.761906 1.1.1 0.0.0 TIPC 74
Name Dist Withdrawal type:226 inst:19712
5 2023-06-20 16:23:09.761906 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:19968
6 2023-06-20 16:23:09.761954 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
7 2023-06-20 16:23:09.761965 1.1.2 1.1.1 TIPC 105
Payld:Low NamedMsg type:226 inst:19968
8 2023-06-20 16:23:09.762054 1.1.1 0.0.0 TIPC 74
Name Dist Withdrawal type:226 inst:19968
9 2023-06-20 16:23:09.762054 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:20224
10 2023-06-20 16:23:09.762101 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
11 2023-06-20 16:23:09.762112 1.1.2 1.1.1 TIPC 105
Payld:Low NamedMsg type:226 inst:20224
12 2023-06-20 16:23:09.762250 1.1.1 0.0.0 TIPC 74
Name Dist Withdrawal type:226 inst:20224
13 2023-06-20 16:23:09.762250 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:20480
14 2023-06-20 16:23:09.762250 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
15 2023-06-20 16:23:09.762254 1.1.2 1.1.1 TIPC 58
Link State State
16 2023-06-20 16:23:09.762267 1.1.2 1.1.1 TIPC 105
Payld:Low NamedMsg type:226 inst:20480
17 2023-06-20 16:23:09.762396 1.1.1 0.0.0 TIPC 74
Name Dist Withdrawal type:226 inst:20480
18 2023-06-20 16:23:09.762396 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:20736
19 2023-06-20 16:23:09.762397 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
20 2023-06-20 16:23:09.762410 1.1.2 1.1.1 TIPC 105
Payld:Low NamedMsg type:226 inst:20736
21 2023-06-20 16:23:09.762592 1.1.1 1.1.2 TIPC 105
Payld:Low NamedMsg type:226 inst:2
22 2023-06-20 16:23:09.762656 1.1.1 0.0.0 TIPC 74
Name Dist Withdrawal type:226 inst:20736
23 2023-06-20 16:23:09.762656 1.1.1 0.0.0 TIPC 74
Name Dist Publication type:226 inst:20992
Packet 23 is the publish, while 21 is the payload.
I would have assumed (and I think the older tipc driver did so) that a datagram
received from a source that is not yet published would update the nametable too?
The test programs usually reproduce the issue within 50-100 iterations.
Here is the client side that reproduces it:
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/tipc.h>
#include <string.h>
#include <unistd.h>
/*
 * TIPC SOCK_RDM reproducer, client side.
 *
 * Runs 1000 iterations. Each iteration:
 *   1. opens a fresh SOCK_RDM socket,
 *   2. binds it to service type 226, instance (i & 0xff) << 8,
 *   3. sends a 51-byte request to {type 226, instance 2}; the first
 *      sizeof(int) bytes carry the instance bound in step 2 so the peer
 *      can address its reply,
 *   4. blocks in recvfrom() until a reply arrives,
 *   5. closes the socket.
 *
 * If a reply never arrives, step 4 blocks forever; that hang is how a lost
 * reply shows up when running this program.
 *
 * Returns EXIT_SUCCESS after all iterations, EXIT_FAILURE if a socket cannot
 * be created or bound.
 */
int main(int argc, char * argv[])
{
    enum {
        SERVICE_TYPE = 226,
        SERVER_INSTANCE = 2,
        ITERATIONS = 1000,
        SEND_SIZE = 51
    };

    char buf[65535];
    int received = 0;

    (void)argc;
    (void)argv;

    /* Zero-initialise so that no padding or unused union bytes are passed
     * to the kernel uninitialised. */
    struct sockaddr_tipc to_addr = {0};
    to_addr.family = AF_TIPC;
    to_addr.scope = 0;
    to_addr.addrtype = TIPC_ADDR_NAME;
    to_addr.addr.name.name.type = SERVICE_TYPE;
    to_addr.addr.name.name.instance = SERVER_INSTANCE;
    to_addr.addr.name.domain = 0;

    for (int i = 0; i < ITERATIONS; i++) {
        int addr = (i & 0xff) << 8;

        int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
        if (sock == -1) {
            perror("opening socket");
            return EXIT_FAILURE;
        }

        struct sockaddr_tipc listen_addr = {0};
        listen_addr.family = AF_TIPC;
        listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
        listen_addr.addr.nameseq.type = SERVICE_TYPE;
        listen_addr.addr.nameseq.lower = addr;
        listen_addr.addr.nameseq.upper = addr;
        listen_addr.scope = TIPC_CLUSTER_SCOPE;

        if (bind(sock, (struct sockaddr *)&listen_addr,
                 sizeof listen_addr) == -1) {
            perror("Error opening TIPC socket");
            close(sock);
            return EXIT_FAILURE;
        }

        /* Build the request: zeroed payload with our instance up front.
         * memcpy avoids type-punning the char buffer through an int pointer. */
        memset(buf, 0, SEND_SIZE);
        memcpy(buf, &addr, sizeof addr);

        ssize_t rc = sendto(sock, buf, SEND_SIZE, 0,
                            (struct sockaddr *)&to_addr, sizeof to_addr);
        printf("tipc send rc = %d\n", (int)rc);
        if (rc < 0) {
            perror("send err");
            /* Nothing was sent, so waiting for a reply would block forever. */
            close(sock);
            continue;
        }

        rc = recvfrom(sock, buf, sizeof buf, 0, NULL, NULL);
        if (rc < 0) {
            perror("recv err");
        } else {
            received++;
            printf("Received %d\n", received);
        }

        close(sock);
    }

    return EXIT_SUCCESS;
}
Here is the server side:
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/tipc.h>
/*
 * TIPC SOCK_RDM reproducer, server side.
 *
 * Binds a SOCK_RDM socket to service type 226, instance 2, then loops
 * forever: receive a datagram, read the sender's service instance from its
 * first sizeof(int) bytes, and echo the datagram back to
 * {type 226, instance <that value>}.
 *
 * The reply is addressed by service name (taken from the request payload),
 * not to the sender's socket address.
 *
 * Only returns on a fatal error (socket/bind/recvfrom failure), with
 * EXIT_FAILURE.
 */
int main(int argc, char * argv[])
{
    enum { SERVICE_TYPE = 226, SERVER_INSTANCE = 2 };

    /* Receive buffer overlaid with the leading int of the request, so the
     * reply instance can be read without casting a char pointer to int *. */
    union {
        int reply_instance;
        char bytes[65535];
    } msg;
    int received = 0;

    (void)argc;
    (void)argv;

    int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
    if (sock == -1) {
        perror("opening socket");
        return EXIT_FAILURE;
    }

    /* Zero-initialise so that no padding or unused union bytes are passed
     * to the kernel uninitialised. */
    struct sockaddr_tipc listen_addr = {0};
    listen_addr.family = AF_TIPC;
    listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
    listen_addr.addr.nameseq.type = SERVICE_TYPE;
    listen_addr.addr.nameseq.lower = SERVER_INSTANCE;
    listen_addr.addr.nameseq.upper = SERVER_INSTANCE;
    listen_addr.scope = TIPC_CLUSTER_SCOPE;

    if (bind(sock, (struct sockaddr *)&listen_addr,
             sizeof listen_addr) == -1) {
        perror("Error opening TIPC socket");
        return EXIT_FAILURE;
    }

    for (;;) {
        ssize_t ret = recvfrom(sock, msg.bytes, sizeof msg.bytes, 0,
                               NULL, NULL);
        if (ret < 0) {
            /* Bail out instead of spinning on a persistent error. */
            perror("recvfrom");
            return EXIT_FAILURE;
        }
        if (ret == 0)
            continue;

        received++;
        printf("Received %d\n", received);

        if ((size_t)ret < sizeof msg.reply_instance) {
            /* Too short to carry a return instance; nowhere to reply to. */
            fprintf(stderr, "short request (%d bytes), no reply sent\n",
                    (int)ret);
            continue;
        }

        /* get return instance */
        struct sockaddr_tipc to_addr = {0};
        to_addr.family = AF_TIPC;
        to_addr.scope = 0;
        to_addr.addrtype = TIPC_ADDR_NAME;
        to_addr.addr.name.name.type = SERVICE_TYPE;
        to_addr.addr.name.name.instance = msg.reply_instance;
        to_addr.addr.name.domain = 0;

        /* Report send failures: an unchecked error here would be
         * indistinguishable from a reply lost in transit. */
        if (sendto(sock, msg.bytes, (size_t)ret, 0,
                   (struct sockaddr *)&to_addr, sizeof to_addr) < 0)
            perror("sending reply");
    }
}
_______________________________________________
tipc-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/tipc-discussion