Re: [PATCH net-next v2] tcp: Add mark for TIMEWAIT sockets

2018-05-10 Thread David Miller
From: Jon Maxwell 
Date: Thu, 10 May 2018 16:53:51 +1000

> This version has some suggestions by Eric Dumazet:
> 
> - Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP 
> races. 
> - Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
> statement. 
> - Factorize code as sk_fullsock() check is not necessary.
> 
> Aidan McGurn from Openwave Mobility systems reported the following bug:
> 
> "Marked routing is broken on customer deployment. Its effects are large 
> increase in Uplink retransmissions caused by the client never receiving 
> the final ACK to their FINACK - this ACK misses the mark and routes out 
> of the incorrect route."
> 
> Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
> sysctl is enabled. But not for TW sockets that had sk->sk_mark set via 
> setsockopt(SO_MARK..).  
> 
> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the 
> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark 
> location. 
> Then propagate this so that the skb gets sent with the correct mark. Do the 
> same 
> for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so 
> that
> netfilter rules are still honored.
> 
> Signed-off-by: Jon Maxwell 

I'm surprised the lack of a mark in timewait sockets wasn't noticed earlier.

Applied, thank you.


Re: [PATCH net-next v2] tcp: Add mark for TIMEWAIT sockets

2018-05-10 Thread Eric Dumazet


On 05/09/2018 11:53 PM, Jon Maxwell wrote:
> This version has some suggestions by Eric Dumazet:
> 
> - Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP 
> races. 
> - Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
> statement. 
> - Factorize code as sk_fullsock() check is not necessary.
> 
> Aidan McGurn from Openwave Mobility systems reported the following bug:
> 
> "Marked routing is broken on customer deployment. Its effects are large 
> increase in Uplink retransmissions caused by the client never receiving 
> the final ACK to their FINACK - this ACK misses the mark and routes out 
> of the incorrect route."
> 
> Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
> sysctl is enabled. But not for TW sockets that had sk->sk_mark set via 
> setsockopt(SO_MARK..).  
> 
> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the 
> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark 
> location. 
> Then propagate this so that the skb gets sent with the correct mark. Do the 
> same 
> for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so 
> that
> netfilter rules are still honored.
> 
> Signed-off-by: Jon Maxwell 

Reviewed-by: Eric Dumazet 

Thanks Jon.



[PATCH net-next v2] tcp: Add mark for TIMEWAIT sockets

2018-05-10 Thread Jon Maxwell
This version has some suggestions by Eric Dumazet:

- Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP 
races. 
- Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
statement. 
- Factorize code as sk_fullsock() check is not necessary.

Aidan McGurn from Openwave Mobility systems reported the following bug:

"Marked routing is broken on customer deployment. Its effects are large 
increase in Uplink retransmissions caused by the client never receiving 
the final ACK to their FINACK - this ACK misses the mark and routes out 
of the incorrect route."

Currently marks are added to sk_buffs for replies when the "fwmark_reflect" 
sysctl is enabled. But not for TW sockets that had sk->sk_mark set via 
setsockopt(SO_MARK..).  

Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the 
original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark 
location. 
Then propagate this so that the skb gets sent with the correct mark. Do the same 
for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that
netfilter rules are still honored.

Signed-off-by: Jon Maxwell 
---
 include/net/inet_timewait_sock.h |  1 +
 net/ipv4/ip_output.c |  2 +-
 net/ipv4/tcp_ipv4.c  | 16 ++--
 net/ipv4/tcp_minisocks.c |  1 +
 net/ipv6/tcp_ipv6.c  |  6 +-
 5 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..659d8ed5a3bc 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -62,6 +62,7 @@ struct inet_timewait_sock {
 #define tw_dr  __tw_common.skc_tw_dr
 
int tw_timeout;
+   __u32   tw_mark;
volatile unsigned char  tw_substate;
unsigned char   tw_rcv_wscale;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 95adb171f852..b5e21eb198d8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct 
sk_buff *skb,
oif = skb->skb_iif;
 
flowi4_init_output(&fl4, oif,
-  IP4_REPLY_MARK(net, skb->mark),
+  IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
   RT_TOS(arg->tos),
   RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
   ip_reply_arg_flowi_flags(arg),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b50838..caf23de88f8a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
struct sock *sk1 = NULL;
 #endif
struct net *net;
+   struct sock *ctl_sk;
 
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, 
struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
-   ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+   ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+   if (sk)
+   ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+  inet_twsk(sk)->tw_mark : sk->sk_mark;
+   ip_send_unicast_reply(ctl_sk,
  skb, &TCP_SKB_CB(skb)->header.h4.opt,
  ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  &arg, arg.iov[0].iov_len);
 
+   ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable();
@@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
+   struct sock *ctl_sk;
 
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
-   ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+   ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+   if (sk)
+   ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+  inet_twsk(sk)->tw_mark : sk->sk_mark;
+   ip_send_unicast_reply(ctl_sk,
  skb, &TCP_SKB_CB(skb)->header.h4.opt,
  ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  &arg, arg.iov[0].iov_len);
 
+   ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable();
 }
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index