Hi. We recently upgraded one of our appliances from Ubuntu 18.04 to 22.04 (kernel 5.15.0-75) and are now seeing intermittent issues with RDM messages. This issue was never seen on Ubuntu 18.04 and older (Ubuntu kernel 4.15.0).
I have a minimal set of test programs that reproduces the issue. Basically, we have a client side that opens a port with a semi-random TIPC instance and sends a message to a server at a known address. Part of the message is the client's instance address, so the server can send back a reply. Occasionally the server's reply is lost. We've tracked it down to the publish message occasionally trailing the actual message on the server side. When that happens, the server's TIPC stack apparently drops the message. This ONLY happens when client and server are on two different pieces of hardware (fairly beefy Supermicro servers with 2 Xeon 5110 or Silver). Here is a tcpdump of the sequence where it fails: 1 2023-06-20 16:23:09.761711 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:19712 2 2023-06-20 16:23:09.761758 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 3 2023-06-20 16:23:09.761770 1.1.2 1.1.1 TIPC 105 Payld:Low NamedMsg type:226 inst:19712 4 2023-06-20 16:23:09.761906 1.1.1 0.0.0 TIPC 74 Name Dist Withdrawal type:226 inst:19712 5 2023-06-20 16:23:09.761906 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:19968 6 2023-06-20 16:23:09.761954 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 7 2023-06-20 16:23:09.761965 1.1.2 1.1.1 TIPC 105 Payld:Low NamedMsg type:226 inst:19968 8 2023-06-20 16:23:09.762054 1.1.1 0.0.0 TIPC 74 Name Dist Withdrawal type:226 inst:19968 9 2023-06-20 16:23:09.762054 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:20224 10 2023-06-20 16:23:09.762101 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 11 2023-06-20 16:23:09.762112 1.1.2 1.1.1 TIPC 105 Payld:Low NamedMsg type:226 inst:20224 12 2023-06-20 16:23:09.762250 1.1.1 0.0.0 TIPC 74 Name Dist Withdrawal type:226 inst:20224 13 2023-06-20 16:23:09.762250 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:20480 14 2023-06-20 16:23:09.762250 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 15 2023-06-20 16:23:09.762254 1.1.2 1.1.1 TIPC 58 Link 
State State 16 2023-06-20 16:23:09.762267 1.1.2 1.1.1 TIPC 105 Payld:Low NamedMsg type:226 inst:20480 17 2023-06-20 16:23:09.762396 1.1.1 0.0.0 TIPC 74 Name Dist Withdrawal type:226 inst:20480 18 2023-06-20 16:23:09.762396 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:20736 19 2023-06-20 16:23:09.762397 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 20 2023-06-20 16:23:09.762410 1.1.2 1.1.1 TIPC 105 Payld:Low NamedMsg type:226 inst:20736 21 2023-06-20 16:23:09.762592 1.1.1 1.1.2 TIPC 105 Payld:Low NamedMsg type:226 inst:2 22 2023-06-20 16:23:09.762656 1.1.1 0.0.0 TIPC 74 Name Dist Withdrawal type:226 inst:20736 23 2023-06-20 16:23:09.762656 1.1.1 0.0.0 TIPC 74 Name Dist Publication type:226 inst:20992 Packet 23 is the publish, while 21 is the payload. I would have assumed (and I think the older tipc driver did so) that a datagram received from a source that is not yet published would update the nametable too? Test programs will usually recreate within 50-100 iterations Here is the client side that will recreate:

#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/tipc.h>
#include <string.h>
#include <unistd.h>

/*
 * Client side of the reproducer.
 *
 * For each of 1000 iterations: open a fresh SOCK_RDM socket, bind it to
 * service {226, addr} (addr = (i & 0xff) << 8), send a 51-byte message whose
 * first sizeof(int) bytes carry addr to service {226, 2}, block until a
 * reply is received, then close the socket.
 *
 * Returns 0 after all iterations, EXIT_FAILURE if a socket cannot be created
 * or bound.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	const size_t sendsize = 51;
	char buf[65535];
	int c = 0;

	memset(buf, 0, sendsize);

	/* Destination: the server's well-known service name {226, 2}. */
	struct sockaddr_tipc to_addr;
	memset(&to_addr, 0, sizeof(to_addr));
	to_addr.family = AF_TIPC;
	to_addr.scope = 0;
	to_addr.addrtype = TIPC_ADDR_NAME;
	to_addr.addr.name.name.type = 226;
	to_addr.addr.name.name.instance = 2;
	to_addr.addr.name.domain = 0;

	for (int i = 0; i < 1000; i++) {
		int addr = (i & 0xff) << 8;

		int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
		if (sock == -1) {
			perror("opening socket");
			return EXIT_FAILURE;
		}

		/* Publish the per-iteration reply name {226, addr}. */
		struct sockaddr_tipc listen_addr;
		memset(&listen_addr, 0, sizeof(listen_addr));
		listen_addr.family = AF_TIPC;
		listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
		listen_addr.addr.nameseq.type = 226;
		listen_addr.addr.nameseq.lower = addr;
		listen_addr.addr.nameseq.upper = addr;
		listen_addr.scope = TIPC_CLUSTER_SCOPE;

		if (bind(sock, (struct sockaddr *)&listen_addr,
			 sizeof(listen_addr)) == -1) {
			/* Without the binding no reply can ever reach us. */
			perror("Error opening TIPC socket");
			close(sock);
			return EXIT_FAILURE;
		}

		/* Tell the server which instance to reply to (memcpy avoids
		 * type-punning the char buffer). */
		memcpy(buf, &addr, sizeof(addr));

		ssize_t rc = sendto(sock, buf, sendsize, 0,
				    (struct sockaddr *)&to_addr, sizeof(to_addr));
		/* Report the error before printf() can overwrite errno. */
		if (rc < 0)
			perror("send err");
		printf("tipc send rc = %zd\n", rc);

		/* Blocks forever if the server's reply is dropped. */
		rc = recvfrom(sock, buf, sizeof(buf), 0, NULL, NULL);
		c++;
		if (rc < 0)
			perror("recv err");
		printf("Received %d\n", c);

		close(sock);
	}

	return 0;
}

Here is the server side:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/tipc.h>

/*
 * Server side of the reproducer.
 *
 * Binds a SOCK_RDM socket to service {226, 2} and loops forever: receive a
 * message, read the reply instance from its first sizeof(int) bytes, and
 * send the message back to service {226, instance}.
 *
 * Returns EXIT_FAILURE if the socket cannot be created or bound; otherwise
 * it never returns.
 */
int main(int argc, char *argv[])
{
	(void)argc;
	(void)argv;

	int sock = socket(AF_TIPC, SOCK_RDM | SOCK_CLOEXEC, 0);
	if (sock == -1) {
		perror("opening socket");
		return EXIT_FAILURE;
	}

	struct sockaddr_tipc listen_addr;
	memset(&listen_addr, 0, sizeof(listen_addr));
	listen_addr.family = AF_TIPC;
	listen_addr.addrtype = TIPC_ADDR_NAMESEQ;
	listen_addr.addr.nameseq.type = 226;
	listen_addr.addr.nameseq.lower = 2;
	listen_addr.addr.nameseq.upper = 2;
	listen_addr.scope = TIPC_CLUSTER_SCOPE;

	if (bind(sock, (struct sockaddr *)&listen_addr,
		 sizeof(listen_addr)) == -1) {
		perror("Error opening TIPC socket");
		return EXIT_FAILURE;
	}

	char buf[65535];
	int c = 0;

	while (true) {
		ssize_t ret = recvfrom(sock, buf, sizeof(buf), 0, NULL, NULL);
		c++;

		if (ret < 0) {
			perror("recv err");
			continue;
		}
		if (ret == 0)
			continue;

		printf("Received %d\n", c);

		/* get return instance */
		int addr;
		if ((size_t)ret < sizeof(addr)) {
			/* Too short to carry a reply instance; nothing to answer. */
			continue;
		}
		memcpy(&addr, buf, sizeof(addr));

		struct sockaddr_tipc to_addr;
		memset(&to_addr, 0, sizeof(to_addr));
		to_addr.family = AF_TIPC;
		to_addr.scope = 0;
		to_addr.addrtype = TIPC_ADDR_NAME;
		to_addr.addr.name.name.type = 226;
		to_addr.addr.name.name.instance = addr;
		to_addr.addr.name.domain = 0;

		/* Echo the message back to the client's published name. */
		if (sendto(sock, buf, (size_t)ret, 0,
			   (struct sockaddr *)&to_addr, sizeof(to_addr)) < 0)
			perror("send err");
	}
}

_______________________________________________ tipc-discussion mailing list tipc-discussion@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/tipc-discussion