Dear all,

we have a very strange problem concerning TCP connections: once in a while a 
new connect takes exactly 3 seconds to get established. We discovered this 
problem with connects to a mysql server but it is reproducable with a few lines 
of C code:

/* START test program */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <netinet/tcp.h>
#include <netinet/in.h>

main() {

   /* edit Server and Port */
   char *server = "192.168.1.1";
   int port = 3306;

   int sock;
   int len;
   int flag;
   struct sockaddr_in sock_addr;
   int count = 0;
   struct timeval tim;

   while(1) {

      printf("%d\n", count);

      gettimeofday(&tim, NULL);
      double t1=tim.tv_sec+(tim.tv_usec/1000000.0);

      if ((sock = socket(PF_INET, SOCK_STREAM, IPPROTO_IP)) < 0) {
         fprintf(stderr, "ERROR: cannot create socket\n");
         exit(1);
      }

      fcntl(sock, F_SETFL, O_RDONLY);
      fcntl(sock, F_GETFL);

      memset (&sock_addr, 0, sizeof (sock_addr));
      sock_addr.sin_family = AF_INET;
      sock_addr.sin_addr.s_addr = inet_addr (server);
      sock_addr.sin_port = htons (port);
      len = sizeof(sock_addr);

      if (connect(sock, (struct sockaddr *)&sock_addr, len) < 0) {
         fprintf(stderr, "ERROR: cannot connect to %s\n", server);
         exit(1);
      }

      flag = 8;
      setsockopt(sock, SOL_IP, IP_TOS, (char *) &flag, sizeof(int));
      flag = 1;
      setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *) &flag, sizeof(int));
      setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *) &flag, sizeof(int));

      shutdown(sock, SHUT_RDWR);

      close(sock);

      gettimeofday(&tim, NULL);
      double t2=tim.tv_sec+(tim.tv_usec/1000000.0);
      if ((t2-t1) > 1) {
         printf("%s: %.6lf seconds elapsed\n", server, t2-t1);
      }

      count++;

   }

}
/* END test program*/

Just compile the code on a client (first of all edit the IP address and port of 
the server), open a port on the server (e.g. with netcat: nc -k -l <PORT>) and 
run the binary against it. The program will terminate with the error message 
"ERROR: cannot connect to 192.168.1.1" but when you start it again and again 
you will get the described behavior after some time (message "192.168.1.1: 
3.000903 seconds elapsed").

We have traced the client using strace with the following result (first number 
is the time taken for the system call):

     0.001620 gettimeofday({1202125307, 657853}, NULL) = 0
     0.000132 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
     0.000110 fcntl64(3, F_SETFL, O_RDONLY) = 0
     0.000076 fcntl64(3, F_GETFL)       = 0x2 (flags O_RDWR)
     0.000079 connect(3, {sa_family=AF_INET, sin_port=htons(3306), 
sin_addr=inet_addr("192.168.1.1")}, 16) = 0
     2.999904 setsockopt(3, SOL_IP, IP_TOS, [8], 4) = 0
     0.000099 setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
     0.000076 setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
     0.000072 shutdown(3, 2 /* send and receive */) = 0
     0.000150 close(3)                  = 0
     0.000245 gettimeofday({1202125310, 658756}, NULL) = 0
     0.000089 write(1, "192.168.1.1: 3.000903 seconds e"..., 39) = 39

As you can see the first setsockopt takes 3 seconds. In order to eliminate the 
setsockopt call as causer of the problem we have made the connect after all 
setsockopts with the following result:

     0.000225 gettimeofday({1202123697, 552194}, NULL) = 0
     0.000076 socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3
     0.000110 fcntl64(3, F_SETFL, O_RDONLY) = 0
     0.000077 fcntl64(3, F_GETFL)       = 0x2 (flags O_RDWR)
     0.000088 setsockopt(3, SOL_IP, IP_TOS, [8], 4) = 0
     0.000083 setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
     0.000103 setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
     0.000073 connect(3, {sa_family=AF_INET, sin_port=htons(3306), 
sin_addr=inet_addr("192.168.1.1")}, 16) = 0
     3.000288 shutdown(3, 2 /* send and receive */) = 0
     0.000199 close(3)                  = 0
     0.000236 gettimeofday({1202123700, 553527}, NULL) = 0
     0.000090 write(1, "192.168.1.1: 3.001333 seconds e"..., 39) = 39

It looks like the connect call itself is not the problem rather then the first 
access to the socket after the connect. Perhaps the "real" connect (TCP SYN 
paket) is not made until the first access to the socket happens (regardless if 
this first access is an setsockopt or shutdown).

When you look at the TCP pakets transmitted between server an client you will 
see that the client sends a SYN and the server answers with a SYN/ACK. However 
the client will never receive the SYN/ACK. Therefore the client resends a SYN 
after exactly 3 seconds. The server answers with a SYN/ACK again and this time 
the client receives the paket. Now the connection is working.

tcpdump from the client:

11:24:44.045735 IP 192.168.1.99.39028 > 192.168.1.1.3306: S 
3068401272:3068401272(0) win 5840 <mss 1460,sackOK,timestamp 10592352 
0,nop,wscale 6>
11:24:47.045710 IP 192.168.1.99.39028 > 192.168.1.1.3306: S 
3068401272:3068401272(0) win 5840 <mss 1460,sackOK,timestamp 10595352 
0,nop,wscale 6>
11:24:47.045911 IP 192.168.1.1.3306 > 192.168.1.99.39028: S 
3052139931:3052139931(0) ack 3068401273 win 5792 <mss 1460,sackOK,timestamp 
1048187 10592352,nop,wscale 7>
11:24:47.046015 IP 192.168.1.99.39028 > 192.168.1.1.3306: . ack 1 win 92 
<nop,nop,timestamp 10595352 1048187>
11:24:47.046541 IP 192.168.1.99.39028 > 192.168.1.1.3306: F 1:1(0) ack 1 win 92 
<nop,nop,timestamp 10595352 1048187>
11:24:47.046845 IP 192.168.1.1.3306 > 192.168.1.99.39028: F 1:1(0) ack 2 win 46 
<nop,nop,timestamp 1048187 10595352>
11:24:47.046880 IP 192.168.1.99.39028 > 192.168.1.1.3306: . ack 2 win 92 
<nop,nop,timestamp 10595353 1048187>

tcpdump from the server:

11:24:44.226373 IP 192.168.1.99.39028 > 192.168.1.1.3306: S 
3068401272:3068401272(0) win 5840 <mss 1460,sackOK,timestamp 10592352 
0,nop,wscal e 6>
11:24:44.226389 IP 192.168.1.1.3306 > 192.168.1.99.39028: S 
3052139931:3052139931(0) ack 3068401273 win 5792 <mss 1460,sackOK,timestamp 
10478 87 10592352,nop,wscale 7>
11:24:47.226673 IP 192.168.1.99.39028 > 192.168.1.1.3306: S 
3068401272:3068401272(0) win 5840 <mss 1460,sackOK,timestamp 10595352 
0,nop,wscale 6>
11:24:47.226715 IP 192.168.1.1.3306 > 192.168.1.99.39028: S 
3052139931:3052139931(0) ack 3068401273 win 5792 <mss 1460,sackOK,timestamp 
1048187 10592352,nop,wscale 7>
11:24:47.226957 IP 192.168.1.99.39028 > 192.168.1.1.3306: . ack 1 win 92 
<nop,nop,timestamp 10595352 1048187>
11:24:47.227488 IP 192.168.1.99.39028 > 192.168.1.1.3306: F 1:1(0) ack 1 win 92 
<nop,nop,timestamp 10595352 1048187>
11:24:47.227580 IP 192.168.1.1.3306 > 192.168.1.99.39028: F 1:1(0) ack 2 win 46 
<nop,nop,timestamp 1048187 10595352>
11:24:47.227810 IP 192.168.1.99.39028 > 192.168.1.1.3306: . ack 2 win 92 
<nop,nop,timestamp 10595353 1048187>

The interesting part: when the delay of 3 seconds occurs then every new 
connection has this problem. So it can't be a missing TCP paket due to a faulty 
switch or something else. Furthermore the problem can be reproduced also 
locally on the same machine (-> lo and not ether) without any network stuff 
(cables, switches, ...). In addition the kernel version seems to be relevant. 
There are kernels where this problem occures very often (e.g. kernel 2.6.23.8) 
and versions where it's only sporadically apparent (e.g. kernel 2.6.20.3 or 
kernel 2.6.23.1). We have tested with standard Fedora kernels as well as with 
"original" kernels from kernel.org but there seems to be no kernel WITHOUT this 
behavior (it's a general issue regardless of the kernel version). We have also 
tested with different hardware (especially different network cards with 
different drivers -> tg3, e1000) without success.

Has somebody else seen this problem before and can tell us how to fix it? We 
haven't found anything on the web. Are there any sysctl parameters or something 
like that? Is it really a kernel problem/bug (seems so) or perhaps a feature?

Any help would be appreciated. At the moment we have nasty hangs in our 
application due to the "3 second connects".

Kind regards,
Leo

-- 
Psssst! Schon vom neuen GMX MultiMessenger gehört?
Der kann`s mit allen: http://www.gmx.net/de/go/multimessenger
-
To unsubscribe from this list: send the line "unsubscribe linux-net" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to