Update: dmtcp-2.6.1~rc1 on CentOS 7 emits several diagnostics when I take a checkpoint after the sockets are connected:
[40000] WARNING at socketconnection.cpp:568 in recvPeerInformation; REASON='JWARNING(false) failed' _fds[0] = 4 _localInode = 0 _remoteInode = 0 Message: DMTCP detected an "external" connect socket.The socket will be restored as a dead socket. Try searching for the "external" process with _remoteInode using "netstat -pae | grep <_remoteInode>" or "ss -axp | grep <_remoteInode>". ... [40000] WARNING at kernelbufferdrainer.cpp:125 in onTimeoutInterval; REASON='JWARNING(false) failed' _dataSockets[i]->socket().sockfd() = 6 buffer.size() = 0 WARN_INTERVAL_SEC = 10 derek.bea...@synopsys.com<mailto:derek.bea...@synopsys.com> ZeBu R&D team, Synopsys, Austin, Texas (CDT, UTC-05:00) Urgent? Please phone<tel:+15126511517> 61517 +15126511517 From: Derek Beatty via Dmtcp-forum <dmtcp-forum@lists.sourceforge.net> Sent: Thursday, October 22, 2020 11:04 AM To: dmtcp-forum@lists.sourceforge.net Subject: [Dmtcp-forum] SCTP unsupported? I’m guessing, from lack of comments in the code and failure on the following small test program, that SCTP even in basic form is unsupported. To observe the failure, compile the code below and run with e.g. dmtcp_launch ./a.out 1500 5 5 and do a dmtcp_command -bc in a separate shell. The program waits 4 seconds before connecting any sockets. Behavior varies with when during execution I take the checkpoint. If I take it before any sockets are connected, then a restart works fine. If I take it after sockets are connected, restart fails: [40000] ERROR at connectionrewirer.cpp:114 in doReconnect; REASON='JASSERT(_real_connect(fd, (sockaddr*) &remoteAddr.addr, remoteAddr.len) == 0) failed' id = 24aa32f81b91247f-40000-48aee2af7b128f(99007) (strerror((*__errno_location ()))) = Connection refused Message: failed to restore connection a.out (40000): Terminating... 
// Derek Beatty dbea...@acm.org<mailto:dbea...@acm.org>
//
// Test program for SCTP communication.
//
// Build:  g++ -ggdb sctp-ipv4-test.cc -pthread
// Usage:  ./a.out <port> <numLoops> <numClients>
//
// The main thread listens on a one-to-one style (SOCK_STREAM) SCTP socket and
// polls it; each worker thread sleeps, connects to localhost:<port>, sends
// <numLoops> NUL-terminated messages one second apart, then disconnects.

#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

#include <netdb.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <sys/socket.h>

int port;        // numeric listen/connect port, parsed from argv[1]
char* portStr;   // argv[1] as given; passed to getaddrinfo() as the service
int numLoops;    // messages each worker sends
int numClients;  // number of worker threads (also the listen backlog)

using namespace std;

namespace {
// Serialises writes to clog: workers and the main thread log concurrently.
mutex logMutex;
}

// Writes one line to clog (flushed), atomically with respect to other
// log()/die() calls.
void log(string s)
{
    lock_guard<mutex> guard(logMutex);
    clog << s << endl;
}

// Prints "error: <s>[: <strerror(errno)>]" to clog and terminates the whole
// process with exit status 1. The errno suffix is only added when errno is
// non-zero on entry.
[[noreturn]] void die(string s)
{
    // Capture errno first: anything below may overwrite it.
    const int savedErrno = errno;
    string why;
    if (savedErrno)
        why = string(": ") + strerror(savedErrno);
    {
        lock_guard<mutex> guard(logMutex);
        clog << "error: " << s << why << endl;
    }
    exit(1);
}

// Client thread body: waits 4 seconds, connects to localhost:<portStr> over
// SCTP, sends numLoops messages (one per second), then shuts down and closes
// the socket. Any failure terminates the process via die().
void worker(int w)
{
    log("start worker " + to_string(w));
    sleep(4);

    int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
    if (sock < 0)
        die("worker socket");
    log("worker " + to_string(w) + " fd " + to_string(sock));

    // Value-initialise and assign by name instead of using designated
    // initialisers, which are not standard C++17 and depend on field order.
    addrinfo hints{};
    // The socket above is AF_INET, so only ask for IPv4 results; with
    // AF_UNSPEC the first entry for "localhost" may be ::1 and connect()
    // would then fail.
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_SCTP;

    addrinfo* result = nullptr;
    int errcode = getaddrinfo("localhost", portStr, &hints, &result);
    if (errcode) {
        die(string("worker getaddrinfo: ") + gai_strerror(errcode));
    }

    const int rc = connect(sock, result->ai_addr, result->ai_addrlen);
    const int connectErrno = errno;
    freeaddrinfo(result);  // the address list is no longer needed
    if (rc < 0) {
        errno = connectErrno;  // freeaddrinfo() must not mask the real cause
        die("worker connect");
    }
    log("worker " + to_string(w) + " connected");

    for (int i = 0; i < numLoops; i++) {
        sleep(1);
        auto s = "w:" + to_string(w) + " i:" + to_string(i);
        // 1 + size: the terminating NUL is sent too, so the receiver can
        // treat the payload as a C string.
        if (send(sock, s.c_str(), 1 + s.size(), MSG_NOSIGNAL) < 0)
            die("worker send");
        log("sent " + s);
    }

    if (shutdown(sock, SHUT_RDWR) < 0)
        die("worker shutdown");
    if (close(sock) < 0)
        die("worker close");
    log("end worker " + to_string(w));
}

// Server/driver: parses arguments, starts the workers, listens on the given
// port and polls the listening socket plus every accepted connection until at
// least one client has connected and all accepted connections have closed.
int main(int argc, char* argv[])
{
    try {
        if (argc != 4)
            die("usage: sctp-test port numLoops numClients");
        port = atoi((portStr = argv[1]));
        // Reject privileged ports and values that do not fit in 16 bits
        // (htons() would silently truncate them).
        if (port < 1024 || port > 65535)
            die("bad port");
        numLoops = atoi(argv[2]);
        if (numLoops < 1)
            die("bad numLoops");
        numClients = atoi(argv[3]);
        if (numClients < 1)
            die("bad numClients");

        vector<thread> workers;
        for (int w = 0; w < numClients; w++) {
            workers.emplace_back([w]() { worker(w); });
        }

        // Non-blocking listener so a spurious wake-up cannot hang accept().
        int sock = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_SCTP);
        if (sock < 0)
            die("master socket");
        log("master fd " + to_string(sock));

        sockaddr_in addr{};
        addr.sin_family = AF_INET;
        addr.sin_port = htons(static_cast<uint16_t>(port));
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        if (bind(sock, (sockaddr*)&addr, sizeof addr) < 0)
            die("bind");
        if (listen(sock, numClients) < 0)
            die("listen");

        // pollvec[0] is always the listening socket; accepted fds follow.
        vector<pollfd> pollvec;
        pollvec.push_back({sock, POLLIN, 0});

        sleep(3);
        log("start polling");
        bool connected = false;
        // Run until at least one client has connected and only the listener
        // is left in the poll set.
        while (pollvec.size() > 1 || !connected) {
            if (poll(pollvec.data(), pollvec.size(), -1) < 0)
                die("poll");
            for (auto pfd = pollvec.begin(); pfd != pollvec.end(); ++pfd) {
                if (!pfd->revents)
                    continue;
                if (pfd->fd == sock) {
                    sockaddr_in peer;
                    socklen_t peer_len = sizeof peer;
                    int connected_fd = accept(sock, (sockaddr*)&peer, &peer_len);
                    if (connected_fd < 0) {
                        if (errno == EAGAIN || errno == EWOULDBLOCK)
                            break;
                        die("accept");
                    }
                    log("connected fd " + to_string(connected_fd));
                    pollvec.push_back({connected_fd, POLLIN, 0});
                    connected = true;
                    // push_back may have invalidated pfd: restart via poll().
                    break;
                } else {
                    char data[16388 + 6];
                    ssize_t len = recv(pfd->fd, data, sizeof data, 0);
                    if (len < 0)
                        die("recv fd " + to_string(pfd->fd));
                    if (len == 0) {
                        log("disconnected fd " + to_string(pfd->fd));
                        if (close(pfd->fd) < 0)
                            die("master close");
                        // erase invalidates pfd: restart via poll().
                        pollvec.erase(pfd);
                        break;
                    }
                    // Only the bytes actually received are valid; stop at the
                    // first NUL but never read beyond len, since a stream
                    // read is not guaranteed to contain a terminator.
                    log("got " + string(data, strnlen(data, static_cast<size_t>(len))));
                }
            }
        }
        log("end polling");

        for (auto& w : workers) {
            w.join();
        }
        log("end joining");
    } catch (exception& e) {
        die(e.what());
    }
    return 0;
}
_______________________________________________ Dmtcp-forum mailing list Dmtcp-forum@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/dmtcp-forum