On 2016/3/30 21:47, Eric Dumazet wrote:
On Wed, 2016-03-30 at 13:56 +0800, Yang Yingliang wrote:

Sorry, I made a mistake. I am quite sure my kernel has these two patches,
and I still see some packets being dropped on the 10Gb interface.

# netstat -s | grep -i backlog
      TCPBacklogDrop: 4135
# netstat -s | grep -i backlog
      TCPBacklogDrop: 4167

The sender will retransmit, and the receiver backlog will likely be emptied
before the packets arrive again.

Are you sure these are TCP drops ?
Yes.


Which 10Gb NIC is it ? (ethtool -i eth0)
The NIC driver is not upstream. And my system is arm64.


What is the max size of the sendmsg() chunks generated by your apps ?
256KB


Are they forcing small SO_RCVBUF or SO_SNDBUF ?
I am not sure.
I added some debug messages in the kernel:
[2016-04-06 10:56:55][ 1365.477140] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12402232 rmem_alloc:0 truesize:53320
[2016-04-06 10:56:55][ 1365.477170] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12460884 rmem_alloc:55986 truesize:58652
[2016-04-06 10:56:55][ 1365.477192] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12506206 rmem_alloc:0 truesize:45322
[2016-04-06 10:56:55][ 1365.477226] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12519536 rmem_alloc:7998 truesize:13330
[2016-04-06 10:56:55][ 1365.477254] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12575522 rmem_alloc:0 truesize:55986
[2016-04-06 10:56:55][ 1365.477282] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:58652
[2016-04-06 10:56:55][ 1365.477301] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:26660 truesize:31992
[2016-04-06 10:56:55][ 1365.477321] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:58652 truesize:26660
[2016-04-06 10:56:55][ 1365.477341] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:58652 truesize:42656
[2016-04-06 10:56:55][ 1365.477384] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:58652
[2016-04-06 10:56:55][ 1365.477403] TCP: rcvbuf:10485760 sndbuf:2097152 limit:12582912 backloglen:12634174 rmem_alloc:0 truesize:34658
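
For context on those fields: sk_add_backlog() drops the skb when the bytes
already charged to the socket exceed the limit. A rough sketch of that check,
paraphrased from the sk_rcvqueues_full() helper in kernels of this era rather
than copied verbatim, looks like this:

/* Sketch of the backlog-full test behind TCPBacklogDrop (paraphrased).
 * "backloglen" and "rmem_alloc" in the debug lines above are the two terms
 * of qsize; "limit" is sk->sk_rcvbuf + sk->sk_sndbuf.
 */
#include <net/sock.h>

static inline bool backlog_is_full(const struct sock *sk, unsigned int limit)
{
	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);

	return qsize > limit;
}

While a user thread owns the socket, each incoming skb's truesize is added to
sk_backlog.len, so a long stretch of userspace processing lets backloglen grow
toward the limit, as in the trace above.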


What percentage of drops do you have ?
The count from netstat -s | grep -i TCPBacklogDrop increases by 20-40 per second.
That is about 0.055% (117724 TCPBacklogDrop / 214502873 InSegs from /proc/net/snmp).


Here (at Google), we have less than one backlog drop per billion
packets, on hosts facing the public Internet.

If a TCP sender sends a burst of tiny packets because it is misbehaving,
you absolutely will drop packets, especially if applications use
sendmsg() with very big lengths and big SO_SNDBUF.
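
To put rough numbers on that (the per-packet truesize below is an assumption
for illustration, not a figure from this report): with the ~12.5 MB limit seen
in the debug output, a few thousand small segments queued while a thread holds
the socket are enough to start dropping.

/* Back-of-the-envelope illustration only.  Buffer sizes are taken from the
 * debug output earlier in the thread; the 2 KB truesize per small skb is an
 * assumed value.
 */
#include <stdio.h>

int main(void)
{
	unsigned int rcvbuf = 10485760;       /* sk->sk_rcvbuf from the log  */
	unsigned int sndbuf = 2097152;        /* sk->sk_sndbuf from the log  */
	unsigned int limit = rcvbuf + sndbuf; /* 12582912 bytes              */
	unsigned int truesize = 2048;         /* assumed per-skb truesize    */

	printf("small skbs to fill the backlog: %u\n", limit / truesize); /* ~6144 */
	return 0;
}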

Trying not to drop these hostile packets, as you did, simply opens
your host to DoS attacks.

Eventually, we should even drop earlier in the TCP stack (before taking
the socket lock).


How about expanding the buffer limit like this:

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d204f3..da1bc16 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_backlog_buf_multi;

 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index f0e8297..9511410 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -631,6 +631,13 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "tcp_backlog_buf_multi",
+               .data           = &sysctl_tcp_backlog_buf_multi,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #ifdef CONFIG_NETLABEL
        {
                .procname       = "cipso_cache_enable",
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 87463c8..337ad55 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -101,6 +101,8 @@ int sysctl_tcp_thin_dupack __read_mostly;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_backlog_buf_multi __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_backlog_buf_multi);

 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13b92d5..39272f3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1635,7 +1635,8 @@ process:
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
        } else if (unlikely(sk_add_backlog(sk, skb,
-                                          sk->sk_rcvbuf + sk->sk_sndbuf))) {
+                                          (sk->sk_rcvbuf + sk->sk_sndbuf) *
+                                          sysctl_tcp_backlog_buf_multi))) {
                bh_unlock_sock(sk);
                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
                goto discard_and_relse;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c1147ac..1e8f709 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1433,7 +1433,8 @@ process:
                if (!tcp_prequeue(sk, skb))
                        ret = tcp_v6_do_rcv(sk, skb);
        } else if (unlikely(sk_add_backlog(sk, skb,
-                                          sk->sk_rcvbuf + sk->sk_sndbuf))) {
+                                          (sk->sk_rcvbuf + sk->sk_sndbuf) *
+                                          sysctl_tcp_backlog_buf_multi))) {
                bh_unlock_sock(sk);
                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
                goto discard_and_relse;
--
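
For a rough sense of what the knob does (using the buffer sizes from the debug
log above, purely as an illustration): with the default multiplier of 1 the
limit stays at sk_rcvbuf + sk_sndbuf, about 12.6 MB for that socket; setting
the proposed sysctl (which would appear as net.ipv4.tcp_backlog_buf_multi) to
2 would raise it to about 25 MB, at the price of letting one socket pin that
much more memory in its backlog while its owner is busy.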

