Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
On Thu, May 10, 2018 at 1:32 PM, Eric Dumazet wrote: > > > On 05/09/2018 07:07 PM, Jon Maxwell wrote: >> Aidan McGurn from Openwave Mobility systems reported the following bug: >> >> "Marked routing is broken on customer deployment. Its effects are large >> increase in Uplink retransmissions caused by the client never receiving >> the final ACK to their FINACK - this ACK misses the mark and routes out >> of the incorrect route." >> >> Currently marks are added to sk_buffs for replies when the "fwmark_reflect" >> sysctl is enabled. But not for TIME_WAIT sockets where the original socket >> had >> sk->sk_mark set via setsockopt(SO_MARK..). >> >> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the >> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark >> location. >> Then copy this into ctl_sk->sk_mark so that the skb gets sent with the >> correct >> mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence >> over >> sk->sk_mark so that netfilter rules are still honored. 
>> >> Signed-off-by: Jon Maxwell >> --- >> include/net/inet_timewait_sock.h | 1 + >> net/ipv4/ip_output.c | 3 ++- >> net/ipv4/tcp_ipv4.c | 18 -- >> net/ipv4/tcp_minisocks.c | 1 + >> net/ipv6/tcp_ipv6.c | 8 +++- >> 5 files changed, 27 insertions(+), 4 deletions(-) >> >> diff --git a/include/net/inet_timewait_sock.h >> b/include/net/inet_timewait_sock.h >> index c7be1ca8e562..659d8ed5a3bc 100644 >> --- a/include/net/inet_timewait_sock.h >> +++ b/include/net/inet_timewait_sock.h >> @@ -62,6 +62,7 @@ struct inet_timewait_sock { >> #define tw_dr __tw_common.skc_tw_dr >> >> int tw_timeout; >> + __u32 tw_mark; >> volatile unsigned char tw_substate; >> unsigned char tw_rcv_wscale; >> >> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c >> index 95adb171f852..cca4412dc4cb 100644 >> --- a/net/ipv4/ip_output.c >> +++ b/net/ipv4/ip_output.c >> @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct >> sk_buff *skb, >> struct sk_buff *nskb; >> int err; >> int oif; >> + __u32 mark = IP4_REPLY_MARK(net, skb->mark); >> >> if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) >> return; >> @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct >> sk_buff *skb, >> oif = skb->skb_iif; >> >> flowi4_init_output(&fl4, oif, >> -IP4_REPLY_MARK(net, skb->mark), >> +mark ? (mark) : sk->sk_mark, > > You can avoid the declaration of mark variable and simply use here : > > IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, > Thanks for the advice and suggestions Eric. That is more elegant. Will do in v1. 
>> RT_TOS(arg->tos), >> RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, >> ip_reply_arg_flowi_flags(arg), >> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c >> index f70586b50838..fbee36579c83 100644 >> --- a/net/ipv4/tcp_ipv4.c >> +++ b/net/ipv4/tcp_ipv4.c >> @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, >> struct sk_buff *skb) >> struct sock *sk1 = NULL; >> #endif >> struct net *net; >> + struct sock *ctl_sk; >> >> /* Never send a reset in response to a reset. */ >> if (th->rst) >> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, >> struct sk_buff *skb) >> arg.tos = ip_hdr(skb)->tos; >> arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); >> local_bh_disable(); >> - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), >> + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); >> + if (sk && sk->sk_state == TCP_TIME_WAIT) >> + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; >> + else if (sk && sk_fullsock(sk)) >> + ctl_sk->sk_mark = sk->sk_mark; >> + ip_send_unicast_reply(ctl_sk, >> skb, &TCP_SKB_CB(skb)->header.h4.opt, >> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >> &arg, arg.iov[0].iov_len); >> >> + ctl_sk->sk_mark = 0; >> __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); >> __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); >> local_bh_enable(); >> @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk, >> } rep; >> struct net *net = sock_net(sk); >> struct ip_reply_arg arg; >> + struct sock *ctl_sk; >> >> memset(&rep.th, 0, sizeof(struct tcphdr)); >> memset(&arg, 0, sizeof(arg)); >> @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk, >> arg.tos = tos; >> arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); >> local_bh_disable(); >> -
Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
On Thu, May 10, 2018 at 1:32 PM, Eric Dumazet wrote: > > > On 05/09/2018 07:07 PM, Jon Maxwell wrote: >> Aidan McGurn from Openwave Mobility systems reported the following bug: >> >> "Marked routing is broken on customer deployment. Its effects are large >> increase in Uplink retransmissions caused by the client never receiving >> the final ACK to their FINACK - this ACK misses the mark and routes out >> of the incorrect route." >> >> Currently marks are added to sk_buffs for replies when the "fwmark_reflect" >> sysctl is enabled. But not for TIME_WAIT sockets where the original socket >> had >> sk->sk_mark set via setsockopt(SO_MARK..). >> >> Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the >> original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark >> location. >> Then copy this into ctl_sk->sk_mark so that the skb gets sent with the >> correct >> mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence >> over >> sk->sk_mark so that netfilter rules are still honored. 
>> >> Signed-off-by: Jon Maxwell >> --- >> include/net/inet_timewait_sock.h | 1 + >> net/ipv4/ip_output.c | 3 ++- >> net/ipv4/tcp_ipv4.c | 18 -- >> net/ipv4/tcp_minisocks.c | 1 + >> net/ipv6/tcp_ipv6.c | 8 +++- >> 5 files changed, 27 insertions(+), 4 deletions(-) >> >> diff --git a/include/net/inet_timewait_sock.h >> b/include/net/inet_timewait_sock.h >> index c7be1ca8e562..659d8ed5a3bc 100644 >> --- a/include/net/inet_timewait_sock.h >> +++ b/include/net/inet_timewait_sock.h >> @@ -62,6 +62,7 @@ struct inet_timewait_sock { >> #define tw_dr __tw_common.skc_tw_dr >> >> int tw_timeout; >> + __u32 tw_mark; >> volatile unsigned char tw_substate; >> unsigned char tw_rcv_wscale; >> >> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c >> index 95adb171f852..cca4412dc4cb 100644 >> --- a/net/ipv4/ip_output.c >> +++ b/net/ipv4/ip_output.c >> @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct >> sk_buff *skb, >> struct sk_buff *nskb; >> int err; >> int oif; >> + __u32 mark = IP4_REPLY_MARK(net, skb->mark); >> >> if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) >> return; >> @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct >> sk_buff *skb, >> oif = skb->skb_iif; >> >> flowi4_init_output(&fl4, oif, >> -IP4_REPLY_MARK(net, skb->mark), >> +mark ? (mark) : sk->sk_mark, > > You can avoid the declaration of mark variable and simply use here : > > IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, > Thanks for the advice and suggestions Eric. That is more elegant. Will do in v1. 
>> RT_TOS(arg->tos), >> RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, >> ip_reply_arg_flowi_flags(arg), >> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c >> index f70586b50838..fbee36579c83 100644 >> --- a/net/ipv4/tcp_ipv4.c >> +++ b/net/ipv4/tcp_ipv4.c >> @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, >> struct sk_buff *skb) >> struct sock *sk1 = NULL; >> #endif >> struct net *net; >> + struct sock *ctl_sk; >> >> /* Never send a reset in response to a reset. */ >> if (th->rst) >> @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, >> struct sk_buff *skb) >> arg.tos = ip_hdr(skb)->tos; >> arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); >> local_bh_disable(); >> - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), >> + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); >> + if (sk && sk->sk_state == TCP_TIME_WAIT) >> + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; >> + else if (sk && sk_fullsock(sk)) >> + ctl_sk->sk_mark = sk->sk_mark; >> + ip_send_unicast_reply(ctl_sk, >> skb, &TCP_SKB_CB(skb)->header.h4.opt, >> ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, >> &arg, arg.iov[0].iov_len); >> >> + ctl_sk->sk_mark = 0; >> __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); >> __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); >> local_bh_enable(); >> @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk, >> } rep; >> struct net *net = sock_net(sk); >> struct ip_reply_arg arg; >> + struct sock *ctl_sk; >> >> memset(&rep.th, 0, sizeof(struct tcphdr)); >> memset(&arg, 0, sizeof(arg)); >> @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk, >> arg.tos = tos; >> arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); >> local_bh_disable(); >> - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), >> + ctl_sk =
Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
On 05/09/2018 07:07 PM, Jon Maxwell wrote: > Aidan McGurn from Openwave Mobility systems reported the following bug: > > "Marked routing is broken on customer deployment. Its effects are large > increase in Uplink retransmissions caused by the client never receiving > the final ACK to their FINACK - this ACK misses the mark and routes out > of the incorrect route." > > Currently marks are added to sk_buffs for replies when the "fwmark_reflect" > sysctl is enabled. But not for TIME_WAIT sockets where the original socket > had > sk->sk_mark set via setsockopt(SO_MARK..). > > Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the > original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark > location. > Then copy this into ctl_sk->sk_mark so that the skb gets sent with the > correct > mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence > over > sk->sk_mark so that netfilter rules are still honored. > > Signed-off-by: Jon Maxwell > --- > include/net/inet_timewait_sock.h | 1 + > net/ipv4/ip_output.c | 3 ++- > net/ipv4/tcp_ipv4.c | 18 -- > net/ipv4/tcp_minisocks.c | 1 + > net/ipv6/tcp_ipv6.c | 8 +++- > 5 files changed, 27 insertions(+), 4 deletions(-) > > diff --git a/include/net/inet_timewait_sock.h > b/include/net/inet_timewait_sock.h > index c7be1ca8e562..659d8ed5a3bc 100644 > --- a/include/net/inet_timewait_sock.h > +++ b/include/net/inet_timewait_sock.h > @@ -62,6 +62,7 @@ struct inet_timewait_sock { > #define tw_dr __tw_common.skc_tw_dr > > int tw_timeout; > + __u32 tw_mark; > volatile unsigned char tw_substate; > unsigned char tw_rcv_wscale; > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index 95adb171f852..cca4412dc4cb 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct > sk_buff *skb, > struct sk_buff *nskb; > int err; > int oif; > + __u32 mark = IP4_REPLY_MARK(net, skb->mark); > > if 
(__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) > return; > @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct > sk_buff *skb, > oif = skb->skb_iif; > > flowi4_init_output(&fl4, oif, > -IP4_REPLY_MARK(net, skb->mark), > +mark ? (mark) : sk->sk_mark, You can avoid the declaration of mark variable and simply use here : IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, > RT_TOS(arg->tos), > RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, > ip_reply_arg_flowi_flags(arg), > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index f70586b50838..fbee36579c83 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, > struct sk_buff *skb) > struct sock *sk1 = NULL; > #endif > struct net *net; > + struct sock *ctl_sk; > > /* Never send a reset in response to a reset. */ > if (th->rst) > @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, > struct sk_buff *skb) > arg.tos = ip_hdr(skb)->tos; > arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); > local_bh_disable(); > - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), > + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); > + if (sk && sk->sk_state == TCP_TIME_WAIT) > + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; > + else if (sk && sk_fullsock(sk)) > + ctl_sk->sk_mark = sk->sk_mark; > + ip_send_unicast_reply(ctl_sk, > skb, &TCP_SKB_CB(skb)->header.h4.opt, > ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, > &arg, arg.iov[0].iov_len); > > + ctl_sk->sk_mark = 0; > __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); > __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); > local_bh_enable(); > @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk, > } rep; > struct net *net = sock_net(sk); > struct ip_reply_arg arg; > + struct sock *ctl_sk; > > memset(&rep.th, 0, sizeof(struct tcphdr)); > memset(&arg, 0, sizeof(arg)); > @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk, > arg.tos = tos; > arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); > local_bh_disable(); > - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), > + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); > + if (sk && sk->sk_state == TCP_TIME_WAIT) > + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; > + else if (sk && sk_fullsock(sk)) > +
Re: [PATCH net-next] tcp: Add mark for TIMEWAIT sockets
On 05/09/2018 07:07 PM, Jon Maxwell wrote: > Aidan McGurn from Openwave Mobility systems reported the following bug: > > "Marked routing is broken on customer deployment. Its effects are large > increase in Uplink retransmissions caused by the client never receiving > the final ACK to their FINACK - this ACK misses the mark and routes out > of the incorrect route." > > Currently marks are added to sk_buffs for replies when the "fwmark_reflect" > sysctl is enabled. But not for TIME_WAIT sockets where the original socket > had > sk->sk_mark set via setsockopt(SO_MARK..). > > Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the > original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark > location. > Then copy this into ctl_sk->sk_mark so that the skb gets sent with the > correct > mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence > over > sk->sk_mark so that netfilter rules are still honored. > > Signed-off-by: Jon Maxwell > --- > include/net/inet_timewait_sock.h | 1 + > net/ipv4/ip_output.c | 3 ++- > net/ipv4/tcp_ipv4.c | 18 -- > net/ipv4/tcp_minisocks.c | 1 + > net/ipv6/tcp_ipv6.c | 8 +++- > 5 files changed, 27 insertions(+), 4 deletions(-) > > diff --git a/include/net/inet_timewait_sock.h > b/include/net/inet_timewait_sock.h > index c7be1ca8e562..659d8ed5a3bc 100644 > --- a/include/net/inet_timewait_sock.h > +++ b/include/net/inet_timewait_sock.h > @@ -62,6 +62,7 @@ struct inet_timewait_sock { > #define tw_dr __tw_common.skc_tw_dr > > int tw_timeout; > + __u32 tw_mark; > volatile unsigned char tw_substate; > unsigned char tw_rcv_wscale; > > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index 95adb171f852..cca4412dc4cb 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -1539,6 +1539,7 @@ void ip_send_unicast_reply(struct sock *sk, struct > sk_buff *skb, > struct sk_buff *nskb; > int err; > int oif; > + __u32 mark = IP4_REPLY_MARK(net, skb->mark); > > if 
(__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) > return; > @@ -1561,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct > sk_buff *skb, > oif = skb->skb_iif; > > flowi4_init_output(&fl4, oif, > -IP4_REPLY_MARK(net, skb->mark), > +mark ? (mark) : sk->sk_mark, You can avoid the declaration of mark variable and simply use here : IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, > RT_TOS(arg->tos), > RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, > ip_reply_arg_flowi_flags(arg), > diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c > index f70586b50838..fbee36579c83 100644 > --- a/net/ipv4/tcp_ipv4.c > +++ b/net/ipv4/tcp_ipv4.c > @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, > struct sk_buff *skb) > struct sock *sk1 = NULL; > #endif > struct net *net; > + struct sock *ctl_sk; > > /* Never send a reset in response to a reset. */ > if (th->rst) > @@ -723,11 +724,17 @@ static void tcp_v4_send_reset(const struct sock *sk, > struct sk_buff *skb) > arg.tos = ip_hdr(skb)->tos; > arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); > local_bh_disable(); > - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), > + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); > + if (sk && sk->sk_state == TCP_TIME_WAIT) > + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; > + else if (sk && sk_fullsock(sk)) > + ctl_sk->sk_mark = sk->sk_mark; > + ip_send_unicast_reply(ctl_sk, > skb, &TCP_SKB_CB(skb)->header.h4.opt, > ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, > &arg, arg.iov[0].iov_len); > > + ctl_sk->sk_mark = 0; > __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); > __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); > local_bh_enable(); > @@ -759,6 +766,7 @@ static void tcp_v4_send_ack(const struct sock *sk, > } rep; > struct net *net = sock_net(sk); > struct ip_reply_arg arg; > + struct sock *ctl_sk; > > memset(&rep.th, 0, sizeof(struct tcphdr)); > memset(&arg, 0, sizeof(arg)); > @@ -809,11 +817,17 @@ static void tcp_v4_send_ack(const struct sock *sk, > arg.tos = tos; > arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); > local_bh_disable(); > - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), > + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); > + if (sk && sk->sk_state == TCP_TIME_WAIT) > + ctl_sk->sk_mark = inet_twsk(sk)->tw_mark; > + else if (sk && sk_fullsock(sk)) > + ctl_sk->sk_mark = sk->sk_mark; > +