Re: [PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev
From: David Ahern Date: Sun, 16 Oct 2016 20:02:52 -0700 > Currently, socket lookups for l3mdev (vrf) use cases can match a socket > that is bound to a port but not a device (ie., a global socket). If the > sysctl tcp_l3mdev_accept is not set this leads to ack packets going out > based on the main table even though the packet came in from an L3 domain. > The end result is that the connection does not establish creating > confusion for users since the service is running and a socket shows in > ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the > skb came through an interface enslaved to an l3mdev device and the > tcp_l3mdev_accept is not set. > > skb's through an l3mdev interface are marked by setting a flag in > inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the > flag for IPv4. Using an skb flag avoids a device lookup on the dif. The > flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the > inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the > match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the > move is done after the socket lookup, so IP6CB is used. > > The flags field in inet_skb_parm struct needs to be increased to add > another flag. There is currently a 1-byte hole following the flags, > so it can be expanded to u16 without increasing the size of the struct. > > Fixes: 193125dbd8eb ("net: Introduce VRF device driver") > Signed-off-by: David Ahern Applied, thanks David.
[PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev
Currently, socket lookups for l3mdev (vrf) use cases can match a socket that is bound to a port but not a device (ie., a global socket). If the sysctl tcp_l3mdev_accept is not set this leads to ack packets going out based on the main table even though the packet came in from an L3 domain. The end result is that the connection does not establish creating confusion for users since the service is running and a socket shows in ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the skb came through an interface enslaved to an l3mdev device and the tcp_l3mdev_accept is not set. skb's through an l3mdev interface are marked by setting a flag in inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the flag for IPv4. Using an skb flag avoids a device lookup on the dif. The flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the move is done after the socket lookup, so IP6CB is used. The flags field in inet_skb_parm struct needs to be increased to add another flag. There is currently a 1-byte hole following the flags, so it can be expanded to u16 without increasing the size of the struct. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Signed-off-by: David Ahern --- v4 - renamed existing skb_l3mdev_slave to ipv6_l3mdev_skb - renamed ipv4 version ipv4_l3mdev_skb v3 - changed the match functions to pull the skb flag from TCP_SKB_CB rather than IPCB for IPv4 per changes from 971f10eca186. match function is moved to tcp.h as a consequence. - made flags a u16 versus __u16 for consistency with frag_max_size - updated commit message v2 - reordered the checks in inet_exact_dif_match per Eric's comment - changed the l3mdev determination from looking up the dif to using a flag set on the skb which is much faster drivers/net/vrf.c | 2 ++ include/linux/ipv6.h| 17 ++--- include/net/ip.h| 8 +++- include/net/tcp.h | 13 - net/ipv4/inet_hashtables.c | 8 +--- net/ipv6/inet6_hashtables.c | 7 --- 6 files changed, 44 insertions(+), 11 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 85c271c70d42..820de6a9ddde 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -956,6 +956,7 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, if (skb->pkt_type == PACKET_LOOPBACK) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IP6CB(skb)->flags |= IP6SKB_L3SLAVE; skb->pkt_type = PACKET_HOST; goto out; } @@ -996,6 +997,7 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; + IPCB(skb)->flags |= IPSKB_L3SLAVE; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7e9a789be5e0..ca1ad9ebbc92 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -123,12 +123,12 @@ struct inet6_skb_parm { }; #if defined(CONFIG_NET_L3_MASTER_DEV) -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return flags & IP6SKB_L3SLAVE; } #else -static inline bool skb_l3mdev_slave(__u16 flags) +static inline bool ipv6_l3mdev_skb(__u16 flags) { return false; } @@ -139,11 +139,22 @@ static inline bool skb_l3mdev_slave(__u16 flags) static inline int inet6_iif(const struct sk_buff *skb) { - bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags); + bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags); return l3_slave ? skb->skb_iif : IP6CB(skb)->iif; } +/* can not be used in TCP layer after tcp_v6_fill_cb */ +static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if defined(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_tcp_l3mdev_accept && + ipv6_l3mdev_skb(IP6CB(skb)->flags)) + return true; +#endif + return false; +} + struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; diff --git a/include/net/ip.h b/include/net/ip.h index bc43c0fcae12..c9d07988911e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,7 +38,7 @@ struct sock; struct inet_skb_parm { int iif; struct ip_options opt;/* Compiled IP options */ - unsigned char flags; + u16 flags; #define IPSKB_FORWARDEDBIT(0) #define IPSKB_XFRM_TUNNEL_SIZE BIT(1) @@ -48,10 +48,16 @@ struct inet_skb_parm { #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTUBIT(6) #define IPSKB_FRAG_SEGSBIT(7) +#define IPSKB_L3SLAV
Re: [PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev
On 10/13/16 3:29 PM, Eric Dumazet wrote: > Since netif_index_is_l3_master() is not cheap, can you reorder the > test ? > > if (!net->ipv4.sysctl_tcp_l3mdev_accept) > return netif_index_is_l3_master(net, dif); sure. Since this use case is called under rcu_read_lock I can make a netif_index_is_l3_master_rcu which saves a few cycles.
Re: [PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev
On Thu, 2016-10-13 at 13:14 -0700, David Ahern wrote: > Currently, socket lookups for l3mdev (vrf) use cases can match a socket > that is bound to a port but not a device (ie., a global socket). If the > sysctl tcp_l3mdev_accept is not set this leads to ack packets going out > based on the main table even though the packet came in from an L3 domain. > The end result is that the connection does not establish creating > confusion for users since the service is running and a socket shows in > ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the > dif is an l3mdev device and the tcp_l3mdev_accept is not set. > > Fixes: 193125dbd8eb ("net: Introduce VRF device driver") > Signed-off-by: David Ahern > --- > include/net/inet_sock.h | 10 ++ > net/ipv4/inet_hashtables.c | 7 --- > net/ipv6/inet6_hashtables.c | 7 --- > 3 files changed, 18 insertions(+), 6 deletions(-) > > diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h > index 236a81034fef..abb4e720a769 100644 > --- a/include/net/inet_sock.h > +++ b/include/net/inet_sock.h > @@ -132,6 +132,16 @@ static inline int inet_request_bound_dev_if(const struct > sock *sk, > return sk->sk_bound_dev_if; > } > > +static inline bool inet_exact_dif_match(struct net *net, int dif) > +{ > +#ifdef CONFIG_NET_L3_MASTER_DEV > + if (netif_index_is_l3_master(net, dif) && > + !net->ipv4.sysctl_tcp_l3mdev_accept) > + return true; Since netif_index_is_l3_master() is not cheap, can you reorder the test ? if (!net->ipv4.sysctl_tcp_l3mdev_accept) return netif_index_is_l3_master(net, dif);
[PATCH] net: Require exact match for TCP socket lookups if dif is l3mdev
Currently, socket lookups for l3mdev (vrf) use cases can match a socket that is bound to a port but not a device (ie., a global socket). If the sysctl tcp_l3mdev_accept is not set this leads to ack packets going out based on the main table even though the packet came in from an L3 domain. The end result is that the connection does not establish creating confusion for users since the service is running and a socket shows in ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the dif is an l3mdev device and the tcp_l3mdev_accept is not set. Fixes: 193125dbd8eb ("net: Introduce VRF device driver") Signed-off-by: David Ahern --- include/net/inet_sock.h | 10 ++ net/ipv4/inet_hashtables.c | 7 --- net/ipv6/inet6_hashtables.c | 7 --- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 236a81034fef..abb4e720a769 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -132,6 +132,16 @@ static inline int inet_request_bound_dev_if(const struct sock *sk, return sk->sk_bound_dev_if; } +static inline bool inet_exact_dif_match(struct net *net, int dif) +{ +#ifdef CONFIG_NET_L3_MASTER_DEV + if (netif_index_is_l3_master(net, dif) && + !net->ipv4.sysctl_tcp_l3mdev_accept) + return true; +#endif + return false; +} + struct inet_cork { unsigned intflags; __be32 addr; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 77c20a489218..08b3598c43f9 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -172,7 +172,7 @@ EXPORT_SYMBOL_GPL(__inet_inherit_port); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; struct inet_sock *inet = inet_sk(sk); @@ -186,7 +186,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -215,11 +215,12 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet_exact_dif_match(net, dif); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 00cf28ad4565..f85924883056 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, - const int dif) + const int dif, bool exact_dif) { int score = -1; @@ -109,7 +109,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score++; @@ -131,11 +131,12 @@ struct sock *inet6_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore = 0, matches = 0, reuseport = 0; + bool exact_dif = inet_exact_dif_match(net, dif); struct sock *sk, *result = NULL; u32 phash = 0; sk_for_each(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, dif); + score = compute_score(sk, net, hnum, daddr, dif, exact_dif); if (score > hiscore) { reuseport = sk->sk_reuseport; if (reuseport) { -- 2.1.4