[PATCH net-next v5 9/9] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/udp.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 895fa77bde90..a905bf9ed906 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next v5 8/9] ipv6: handling of multicast packets received in VRF
If skbs for multicast packets marked as enslaved to a VRF are received, then the secondary device index should be used to obtain the real device, and the multicast address should be verified against the enslaved device rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 35 --- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..df58e1100226 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(&hdr->daddr) && - !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, - &hdr->saddr) && + !ipv6_chk_mcast_addr(dev, &hdr->daddr, + &hdr->saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs.
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* -- 2.11.0
[PATCH net-next v5 7/9] ipv6: allow ping to link-local address in VRF
If link-local packets are marked as enslaved to a VRF, then to allow ping to the link-local from a vrf, the error handling for IPV6_PKTINFO needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a slave device to the vrf. Note that the real device also needs to be retrieved in icmp6_iif() to set the ipv6 flow oif to this for icmp echo reply handling. The recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address with VRF") takes care of this, so the sdif does not need checking here. This fix makes ping to link-local consistent with that to global addresses, in that this can now be done from within the same VRF that the address is in. Signed-off-by: Mike Manning --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 381ce38940ae..973e215c3114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next v5 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs
Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept for datagram sockets. Have this default to enabled for reasons of backwards compatibility. This is so as to specify the output device with cmsg and IP_PKTINFO, but using a socket not bound to the corresponding VRF. This allows e.g. older ping implementations to be run with specifying the device but without executing it in the VRF. If the option is disabled, packets received in a VRF context are only handled by a raw socket bound to the VRF, and correspondingly packets in the default VRF are only handled by a socket not bound to any VRF. Signed-off-by: Mike Manning --- Documentation/networking/ip-sysctl.txt | 12 Documentation/networking/vrf.txt | 13 + include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 1 + net/ipv4/af_inet.c | 2 ++ net/ipv4/raw.c | 28 ++-- net/ipv4/sysctl_net_ipv4.c | 11 +++ 7 files changed, 68 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 163b5ff1073c..e0e72e2ff6b2 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -359,6 +359,7 @@ tcp_l3mdev_accept - BOOLEAN derived from the listen socket to be bound to the L3 domain in which the packets originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. +Default: 0 (disabled) tcp_low_latency - BOOLEAN This is a legacy option, it has no effect anymore. @@ -762,6 +763,7 @@ udp_l3mdev_accept - BOOLEAN being received regardless of the L3 domain in which they originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. +Default: 0 (disabled) udp_mem - vector of 3 INTEGERs: min, pressure, max Number of pages allowed for queueing by all UDP sockets. @@ -788,6 +790,16 @@ udp_wmem_min - INTEGER total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 
Default: 4K +RAW variables: + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + Default: 1 (enabled) + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index d4b129402d57..a5f103b083a0 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -111,9 +111,22 @@ the same port if they bind to an l3mdev. TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: + sysctl -w net.ipv4.tcp_l3mdev_accept=1 sysctl -w net.ipv4.udp_l3mdev_accept=1 +These options are disabled by default so that a socket in a VRF is only +selected for packets in that VRF. There is a similar option for RAW +sockets, which is enabled by default for reasons of backwards compatibility. +This is so as to specify the output device with cmsg and IP_PKTINFO, but +using a socket not bound to the corresponding VRF. This allows e.g. older ping +implementations to be run with specifying the device but without executing it +in the VRF. This option can be disabled so that packets received in a VRF +context are only handled by a raw socket bound to the VRF, and packets in the +default VRF are only handled by a socket not bound to any VRF: + +sysctl -w net.ipv4.raw_l3mdev_accept=0 + netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e47503b4e4d1..104a6669e344 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,9 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_raw_l3mdev_accept; +#endif int sysctl_tcp_early_demux; int sysctl_udp_early_demux; diff --git a/include/net/raw.h b/include/net/raw.h index 9c9fa98a91a4..20ebf0b3dfa8 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -61,6 +61,7 @@ void raw_seq_stop(struct seq_file *seq, void *v); int raw_hash_sk(struct sock *sk); void raw_unhash_sk(struct sock *sk); +void raw_init(void); struct raw_sock { /* inet_sock has to be the first member */ diff --git a/net/ipv4/af_ine
[PATCH net-next v5 5/9] net: fix raw socket lookup device bind matching with VRFs
From: Duncan Eastoe When there exist a pair of raw sockets one unbound and one bound to a VRF but equal in all other respects, when a packet is received in the VRF context, __raw_v4_lookup() matches on both sockets. This results in the packet being delivered over both sockets, instead of only the raw socket bound to the VRF. The bound device checks in __raw_v4_lookup() are replaced with a call to raw_sk_bound_dev_eq() which correctly handles whether the packet should be delivered over the unbound socket in such cases. In __raw_v6_lookup() the match on the device binding of the socket is similarly updated to use raw_sk_bound_dev_eq() which matches the handling in __raw_v4_lookup(). Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl into account. Signed-off-by: Duncan Eastoe Signed-off-by: Mike Manning --- include/net/raw.h | 13 - net/ipv4/raw.c| 3 +-- net/ipv6/raw.c| 5 ++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 20ebf0b3dfa8..821ff4887f77 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -17,7 +17,7 @@ #ifndef _RAW_H #define _RAW_H - +#include #include #include @@ -75,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(!!net->ipv4.sysctl_raw_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); +#endif +} + #endif /* _RAW_H */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1ebd29abe79c..fb1f02015a15 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr)&& !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - 
!(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5e0efd3954e9..aed7eb5c2123 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif)) continue; if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { -- 2.11.0
[PATCH net-next v5 0/9] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no easy way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation for stream sockets subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. Similarly, net.ipv4.udp_l3mdev_accept is checked for datagram sockets, and net.ipv4.raw_l3mdev_accept is introduced for raw sockets. The functionality applies to UDP & TCP services as well as those using raw sockets, and is for IPv4 and IPv6. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 v1: - Address Paolo Abeni's comments (patch 4/5) - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5) v2: - Address David Ahern's comments (patches 4/5 and 5/5) - Remove patches 3/5 and 5/5 from series for individual submissions - Include a sysctl for raw sockets as recommended by David Ahern - Expand series into 10 patches and provide improved descriptions v3: - Update description for patch 1/10 and remove patch 6/10 v4: - Set default to enabled for raw socket sysctl as recommended by David Ahern v5: - Address review comments from David Ahern in patches 2-5 Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Duncan Eastoe (1): net: fix raw socket lookup device
bind matching with VRFs Mike Manning (6): net: ensure unbound stream socket to be chosen when not in a VRF net: ensure unbound datagram socket to be chosen when not in a VRF net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs vrf: mark skb for multicast or link-local as enslaved to VRF ipv6: allow ping to link-local address in VRF ipv6: handling of multicast packets received in VRF Robert Shearman (1): net: allow binding socket in a VRF when there's an unbound socket Documentation/networking/ip-sysctl.txt | 12 Documentation/networking/vrf.txt | 22 + drivers/net/vrf.c | 19 +- include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h | 24 --- include/net/inet_sock.h| 21 include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 14 +- include/net/udp.h | 11 +++ net/core/sock.c| 2 ++ net/ipv4/af_inet.c | 2 ++ net/ipv4/inet_connection_sock.c| 13 ++--- net/ipv4/inet_hashtables.c | 34 - net/ipv4/raw.c | 31 ++ net/ipv4/sysctl_net_ipv4.c | 11 +++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c| 10 +++--- net/ipv6/inet6_hashtables.c| 14 ++ net/ipv6/ip6_input.c | 35 +++--- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/raw.c | 5 ++--- net/ipv6/udp.c | 22 ++--- 22 files changed, 243 insertions(+), 84 deletions(-) -- 2.11.0
[PATCH net-next v5 1/9] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman Change the inet socket lookup so that packets arriving on a device enslaved to an l3mdev no longer match unbound sockets. This is done by removing the wildcard for a zero sk_bound_dev_if and instead relying on a check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev (and so matches an unbound socket) and non-zero when the input device is enslaved (and so does not match). Change the socket binding to take the l3mdev into account, so that an unbound socket does not conflict with sockets bound to an l3mdev, given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 13 ++--- include/net/inet_sock.h | 13 + net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 20 +++- 6 files changed, 51 insertions(+), 22 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. +By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..4ae060b4bac2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); @@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net *net, #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_addrpair == (__cookie))&& \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) 
|| \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \
[PATCH net-next v5 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must also be performed when there is no bound device. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif was set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash.
Signed-off-by: Mike Manning --- include/net/udp.h | 11 +++ net/core/sock.c | 2 ++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 10 +++--- net/ipv6/udp.c | 14 +- 5 files changed, 31 insertions(+), 21 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 9e82cb391dea..a496e441645e 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } +static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); +#endif +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/core/sock.c b/net/core/sock.c index 7e8796a6a089..2fff1796dcba 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 07d6fb80f433..2c01d52a8dd9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + 
if (!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..bde08aa549f3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, case IPV6_2292PKTINFO: { struct net_device *dev = NULL; + int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; @@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + src_idx = src_info->ipi6_ifindex; - if (src_info->ipi6_ifindex) { + if (src_idx)
[PATCH net-next v5 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF
The skbs for packets that are multicast or to a link-local address are not marked as being enslaved to a VRF, if they are received on a socket bound to the VRF. This is needed for ND and it is preferable for the kernel not to have to deal with the additional use-cases if ll or mcast packets are handled as enslaved. However, this does not allow service instances listening on unbound and bound to VRF sockets to distinguish the VRF used, if packets are sent as multicast or to a link-local address. The fix is for the VRF driver to also mark these skbs as being enslaved to the VRF. Signed-off-by: Mike Manning --- drivers/net/vrf.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 69b7227c637e..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again.
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; -- 2.11.0
[PATCH net-next v5 2/9] net: ensure unbound stream socket to be chosen when not in a VRF
The commit a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") only ensures that the correct socket is selected for packets in a VRF. However, there is no guarantee that the unbound socket will be selected for packets when not in a VRF. By checking for a device match in compute_score() also for the case when there is no bound device and attaching a score to this, the unbound socket is selected. And if a failure is returned when there is no device match, this ensures that bound sockets are never selected, even if there is no unbound socket. Signed-off-by: Mike Manning --- include/net/inet_hashtables.h | 11 +++ include/net/inet_sock.h | 8 net/ipv4/inet_hashtables.c| 14 ++ net/ipv6/inet6_hashtables.c | 14 ++ 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4ae060b4bac2..0ce460e93dc4 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); +#endif +} + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ed3f723af00b..e8eef85006aa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) return 0; } +static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, +int dif, int sdif) +{ + if (!bound_dev_if) + return !sdif || l3mdev_accept; + return bound_dev_if == dif || 
bound_dev_if == sdif; +} + struct inet_cork { unsigned intflags; __be32 addr; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 260531dc6458..2ec684057ebd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..5eeeba7181a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; + bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { @@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score
[PATCH net-next v4 2/9] net: ensure unbound stream socket to be chosen when not in a VRF
The commit a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") only ensures that the correct socket is selected for packets in a VRF. However, there is no guarantee that the unbound socket will be selected for packets when not in a VRF. By checking for a device match in compute_score() also for the case when there is no bound device and attaching a score to this, the unbound socket is selected. And if a failure is returned when there is no device match, this ensures that bound sockets are never selected, even if there is no unbound socket. Signed-off-by: Mike Manning --- include/net/inet_hashtables.h | 11 +++ include/net/inet_sock.h | 8 net/ipv4/inet_hashtables.c| 14 ++ net/ipv6/inet6_hashtables.c | 14 ++ 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4ae060b4bac2..5de2d9f24c05 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept, + bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index ed3f723af00b..e8eef85006aa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) return 0; } +static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, +int dif, int sdif) +{ + if (!bound_dev_if) + return !sdif || l3mdev_accept; + return bound_dev_if == dif || 
bound_dev_if == sdif; +} + struct inet_cork { unsigned intflags; __be32 addr; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 40d722ab1738..13890d5bfc34 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..5eeeba7181a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; + bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { @@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score
[PATCH net-next v4 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF
The skb for packets that are multicast or to a link-local address are not marked as being enslaved to a VRF, if they are received on a socket bound to the VRF. This is needed for ND and it is preferable for the kernel not to have to deal with the additional use-cases if ll or mcast packets are handled as enslaved. However, this does not allow service instances listening on unbound and bound to VRF sockets to distinguish the VRF used, if packets are sent as multicast or to a link-local address. The fix is for the VRF driver to also mark these skb as being enslaved to the VRF. Signed-off-by: Mike Manning --- drivers/net/vrf.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 69b7227c637e..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again. 
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; -- 2.11.0
[PATCH net-next v4 5/9] net: fix raw socket lookup device bind matching with VRFs
From: Duncan Eastoe When there exist a pair of raw sockets one unbound and one bound to a VRF but equal in all other respects, when a packet is received in the VRF context, __raw_v4_lookup() matches on both sockets. This results in the packet being delivered over both sockets, instead of only the raw socket bound to the VRF. The bound device checks in __raw_v4_lookup() are replaced with a call to raw_sk_bound_dev_eq() which correctly handles whether the packet should be delivered over the unbound socket in such cases. In __raw_v6_lookup() the match on the device binding of the socket is similarly updated to use raw_sk_bound_dev_eq() which matches the handling in __raw_v4_lookup(). Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl into account. Signed-off-by: Duncan Eastoe Signed-off-by: Mike Manning --- include/net/raw.h | 12 net/ipv4/raw.c| 3 +-- net/ipv6/raw.c| 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 20ebf0b3dfa8..6ed2ae5b4a80 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -18,6 +18,7 @@ #define _RAW_H +#include #include #include @@ -75,4 +76,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + #endif /* _RAW_H */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index da453c7dfb75..d42cdd018987 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr)&& !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && 
sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 5e0efd3954e9..aed7eb5c2123 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif)) continue; if (!ipv6_addr_any(>sk_v6_rcv_saddr)) { -- 2.11.0
[PATCH net-next v4 0/9] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no easy way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation for stream sockets subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. Similarly, net.ipv4.udp_l3mdev_accept is checked for datagram sockets, and net.ipv4.raw_l3mdev_accept is introduced for raw sockets. The functionality applies to UDP & TCP services as well as those using raw sockets, and is for IPv4 and IPv6. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 v1: - Address Paolo Abeni's comments (patch 4/5) - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5) v2: - Address David Ahern's comments (patches 4/5 and 5/5) - Remove patches 3/5 and 5/5 from series for individual submissions - Include a sysctl for raw sockets as recommended by David Ahern - Expand series into 10 patches and provide improved descriptions v3: - Update description for patch 1/10 and remove patch 6/10 v4: - Set default to enabled for raw socket sysctl as recommended by David Ahern Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Duncan Eastoe (1): net: fix raw socket lookup device bind matching with VRFs Mike Manning (6): net: ensure unbound 
stream socket to be chosen when not in a VRF net: ensure unbound datagram socket to be chosen when not in a VRF net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs vrf: mark skb for multicast or link-local as enslaved to VRF ipv6: allow ping to link-local address in VRF ipv6: handling of multicast packets received in VRF Robert Shearman (1): net: allow binding socket in a VRF when there's an unbound socket Documentation/networking/ip-sysctl.txt | 12 Documentation/networking/vrf.txt | 22 + drivers/net/vrf.c | 19 +- include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h | 24 --- include/net/inet_sock.h| 21 include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 13 + include/net/udp.h | 11 +++ net/core/sock.c| 2 ++ net/ipv4/af_inet.c | 2 ++ net/ipv4/inet_connection_sock.c| 13 ++--- net/ipv4/inet_hashtables.c | 34 - net/ipv4/raw.c | 19 ++ net/ipv4/sysctl_net_ipv4.c | 11 +++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c| 5 - net/ipv6/inet6_hashtables.c| 14 ++ net/ipv6/ip6_input.c | 35 +++--- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/raw.c | 5 ++--- net/ipv6/udp.c | 22 ++--- 22 files changed, 228 insertions(+), 81 deletions(-) -- 2.11.0
[PATCH net-next v4 7/9] ipv6: allow ping to link-local address in VRF
If link-local packets are marked as enslaved to a VRF, then to allow ping to the link-local from a vrf, the error handling for IPV6_PKTINFO needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a slave device to the vrf. Note that the real device also needs to be retrieved in icmp6_iif() to set the ipv6 flow oif to this for icmp echo reply handling. The recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address with VRF") takes care of this, so the sdif does not need checking here. This fix makes ping to link-local consistent with that to global addresses, in that this can now be done from within the same VRF that the address is in. Signed-off-by: Mike Manning --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 381ce38940ae..973e215c3114 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next v4 8/9] ipv6: handling of multicast packets received in VRF
If the skb for multicast packets marked as enslaved to a VRF are received, then the secondary device index should be used to obtain the real device. And verify the multicast address against the enslaved rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 35 --- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..df58e1100226 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(>daddr) && - !ipv6_chk_mcast_addr(skb->dev, >daddr, - >saddr) && + !ipv6_chk_mcast_addr(dev, >daddr, +>saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* -- 2.11.0
[PATCH net-next v4 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs
Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept for datagram sockets. Have this default to enabled for reasons of backwards compatibility. This is so as to specify the output device with cmsg and IP_PKTINFO, but using a socket not bound to the corresponding VRF. This allows e.g. older ping implementations to be run with specifying the device but without executing it in the VRF. If the option is disabled, packets received in a VRF context are only handled by a raw socket bound to the VRF, and correspondingly packets in the default VRF are only handled by a socket not bound to any VRF. Signed-off-by: Mike Manning --- Documentation/networking/ip-sysctl.txt | 12 Documentation/networking/vrf.txt | 13 + include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 1 + net/ipv4/af_inet.c | 2 ++ net/ipv4/raw.c | 16 ++-- net/ipv4/sysctl_net_ipv4.c | 11 +++ 7 files changed, 56 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 32b21571adfe..aa9e6a331679 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -370,6 +370,7 @@ tcp_l3mdev_accept - BOOLEAN derived from the listen socket to be bound to the L3 domain in which the packets originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. +Default: 0 (disabled) tcp_low_latency - BOOLEAN This is a legacy option, it has no effect anymore. @@ -773,6 +774,7 @@ udp_l3mdev_accept - BOOLEAN being received regardless of the L3 domain in which they originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. +Default: 0 (disabled) udp_mem - vector of 3 INTEGERs: min, pressure, max Number of pages allowed for queueing by all UDP sockets. @@ -799,6 +801,16 @@ udp_wmem_min - INTEGER total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 
Default: 4K +RAW variables: + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + Default: 1 (enabled) + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index d4b129402d57..d234c9750c72 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -111,9 +111,22 @@ the same port if they bind to an l3mdev. TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: + sysctl -w net.ipv4.tcp_l3mdev_accept=1 sysctl -w net.ipv4.udp_l3mdev_accept=1 +These options are disabled by default so that a socket in a VRF is only +selected for packets in that VRF. There is a similar option for RAW +sockets, which is enabled by default for reasons of backwards compatibility. +This is so as to specify the output device with cmsg and IP_PKTINFO, but +using a socket not bound to the corresponding VRF. This allows e.g. older ping +implementations to be run with specifying the device but without executing it +in the VRF. This option can be disabled so that packets received in a VRF +context are only handled by a raw socket bound to the VRF, and packets in the +default VRF are only handled by a socket not bound to any VRF: + +sysctl -w net.ipv4.raw_l3mdev_accept=0 + netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e47503b4e4d1..104a6669e344 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,9 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_raw_l3mdev_accept; +#endif int sysctl_tcp_early_demux; int sysctl_udp_early_demux; diff --git a/include/net/raw.h b/include/net/raw.h index 9c9fa98a91a4..20ebf0b3dfa8 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -61,6 +61,7 @@ void raw_seq_stop(struct seq_file *seq, void *v); int raw_hash_sk(struct sock *sk); void raw_unhash_sk(struct sock *sk); +void raw_init(void); struct raw_sock { /* inet_sock has to be the first member */ diff --git a/net/ipv4/af_inet.c b/net/
[PATCH net-next v4 9/9] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/udp.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0559adc2f357..a25571c12a8a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(>sk_v6_daddr) && !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(>sk_v6_rcv_saddr) && !ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, >head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next v4 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must also be performed when there is no bound device. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif was set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking that the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash. 
Signed-off-by: Mike Manning --- include/net/udp.h | 11 +++ net/core/sock.c | 2 ++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 5 - net/ipv6/udp.c | 14 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 9e82cb391dea..057972d0eea5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } +static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/core/sock.c b/net/core/sock.c index 6fcc4bc07d19..6eda848192aa 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1976fddb9e00..cf73c9194bb6 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if 
(!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..4813293d4fad 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (src_info->ipi6_ifindex) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_info->ipi6_ifindex != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || +!sk_dev_equal_l3scope( +sk, src_info->ipi6_ifindex))) return -EINVAL; fl6->flowi6_oif = src_info->
[PATCH net-next v4 1/9] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman Change the inet socket lookup to prevent packets arriving on a device enslaved to an l3mdev from matching unbound sockets by removing the wildcard for non sk_bound_dev_if and instead relying on a check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev and so match against an unbound socket and not match when the input device is enslaved. Change the socket binding to take the l3mdev into account to allow an unbound socket to not conflict with sockets bound to an l3mdev given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 13 ++--- include/net/inet_sock.h | 13 + net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 20 +++- 6 files changed, 51 insertions(+), 22 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. +By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings. 
-That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..4ae060b4bac2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); @@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net *net, #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_addrpair == (__cookie))&& \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) 
|| \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \
[PATCH] net: allow traceroute with a specified interface in a vrf
Traceroute executed in a vrf succeeds if no device is given or if the vrf is given as the device, but fails if the interface is given as the device. This is for default UDP probes, it succeeds for TCP SYN or ICMP ECHO probes. As the skb bound dev is the interface and the sk dev is the vrf, sk lookup fails for ICMP_DEST_UNREACH and ICMP_TIME_EXCEEDED messages. The solution is for the secondary dev to be passed so that the interface is available for the device match to succeed, in the same way as is already done for non-error cases. Signed-off-by: Mike Manning --- net/ipv4/udp.c | 4 ++-- net/ipv6/udp.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1f5e78d1477d..c9bc08915153 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -676,8 +676,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, 0, - udptable, NULL); + iph->saddr, uh->source, skb->dev->ifindex, + inet_sdif(skb), udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4f0a8728d723..740be1fbd4f5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -543,7 +543,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), 0, udptable, skb); + inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); -- 2.11.0
[PATCH net-next v3 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs
Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept for datagram sockets. Have this default to off as this is what users expect, given that there is no explicit mechanism to set unmodified VRF-unaware application into a default VRF. Signed-off-by: Mike Manning --- Documentation/networking/ip-sysctl.txt | 9 + Documentation/networking/vrf.txt | 8 +--- include/net/netns/ipv4.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 11 +++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8313a636dd53..a46be4a5b7a0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -788,6 +788,15 @@ udp_wmem_min - INTEGER total pages of UDP sockets exceed udp_mem pressure. The unit is byte. Default: 4K +RAW variables: + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index d4b129402d57..deb798342f1e 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -108,11 +108,13 @@ limited to the default VRF. That is, it will not be matched by packets arriving on interfaces enslaved to an l3mdev and processes may bind to the same port if they bind to an l3mdev. 
-TCP & UDP services running in the default VRF context (ie., not bound -to any VRF device) can work across all VRF domains by enabling the -tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: +TCP & UDP services & services using RAW sockets that are running in the +default VRF context (ie., not bound to any VRF device) can work across +all VRF domains by enabling the tcp_l3mdev_accept, udp_l3mdev_accept and +raw_l3mdev_accept sysctl options: sysctl -w net.ipv4.tcp_l3mdev_accept=1 sysctl -w net.ipv4.udp_l3mdev_accept=1 +sysctl -w net.ipv4.raw_l3mdev_accept=1 netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e47503b4e4d1..104a6669e344 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,9 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_raw_l3mdev_accept; +#endif int sysctl_tcp_early_demux; int sysctl_udp_early_demux; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..d173337040ee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -601,6 +601,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "raw_l3mdev_accept", + .data = _net.ipv4.sysctl_raw_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = , + .extra2 = , + }, +#endif { .procname = "tcp_ecn", .data = _net.ipv4.sysctl_tcp_ecn, -- 2.11.0
[PATCH net-next v3 2/9] net: ensure unbound stream socket to be chosen when not in a VRF
The commit a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") only ensures that the correct socket is selected for packets in a VRF. However, there is no guarantee that the unbound socket will be selected for packets when not in a VRF. By checking for a device match in compute_score() also for the case when there is no bound device and attaching a score to this, the unbound socket is selected. And if a failure is returned when there is no device match, this ensures that bound sockets are never selected, even if there is no unbound socket. Signed-off-by: Mike Manning --- include/net/inet_hashtables.h | 11 +++ include/net/inet_sock.h | 8 net/ipv4/inet_hashtables.c| 14 ++ net/ipv6/inet6_hashtables.c | 14 ++ 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4ae060b4bac2..5de2d9f24c05 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept, + bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 92e0aa3958f6..47c03ea989ad 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) return 0; } +static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, +int dif, int sdif) +{ + if (!bound_dev_if) + return !sdif || l3mdev_accept; + return bound_dev_if == dif || 
bound_dev_if == sdif; +} + static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) { return rcu_dereference_check(ireq->ireq_opt, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 260531dc6458..2ec684057ebd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..5eeeba7181a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; + bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { @@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) +
[PATCH net-next v3 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must also be performed when there is no bound device. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif was set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking that the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash. 
Signed-off-by: Mike Manning --- include/net/udp.h | 11 +++ net/core/sock.c | 2 ++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 5 - net/ipv6/udp.c | 14 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 8482a990b0bb..1e4fb8feaf50 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } +static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/core/sock.c b/net/core/sock.c index 3730eb855095..da1cbb88a6bf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3386b3b0218c..0559a7f4c83a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if 
(!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..4813293d4fad 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (src_info->ipi6_ifindex) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_info->ipi6_ifindex != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || +!sk_dev_equal_l3scope( +sk, src_info->ipi6_ifindex))) return -EINVAL; fl6->flowi6_oif = src_info->
[PATCH net-next v3 5/9] net: fix raw socket lookup device bind matching with VRFs
From: Duncan Eastoe When there exist a pair of raw sockets one unbound and one bound to a VRF but equal in all other respects, when a packet is received in the VRF context, __raw_v4_lookup() matches on both sockets. This results in the packet being delivered over both sockets, instead of only the raw socket bound to the VRF. The bound device checks in __raw_v4_lookup() are replaced with a call to raw_sk_bound_dev_eq() which correctly handles whether the packet should be delivered over the unbound socket in such cases. In __raw_v6_lookup() the match on the device binding of the socket is similarly updated to use raw_sk_bound_dev_eq() which matches the handling in __raw_v4_lookup(). Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl into account. Signed-off-by: Duncan Eastoe Signed-off-by: Mike Manning --- include/net/raw.h | 12 net/ipv4/raw.c| 3 +-- net/ipv6/raw.c| 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 9c9fa98a91a4..ce88fdd68933 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -18,6 +18,7 @@ #define _RAW_H +#include #include #include @@ -74,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + #endif /* _RAW_H */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8ca3eb06ba04..61f3559407a6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr)&& !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && 
sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..86978784fbb5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif)) continue; if (!ipv6_addr_any(>sk_v6_rcv_saddr)) { -- 2.11.0
[PATCH net-next v3 9/9] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/udp.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6722490c87b9..821fdc31dbc0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(>sk_v6_daddr) && !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(>sk_v6_rcv_saddr) && !ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, >head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next v3 1/9] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman Change the inet socket lookup to avoid packets arriving on a device enslaved to an l3mdev from matching unbound sockets by removing the wildcard for non sk_bound_dev_if and instead relying on check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev and so match against an unbound socket and not match when the input device is enslaved. Change the socket binding to take the l3mdev into account to allow an unbound socket to not conflict sockets bound to an l3mdev given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 13 ++--- include/net/inet_sock.h | 13 + net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 20 +++- 6 files changed, 51 insertions(+), 22 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. +By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings. 
-That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..4ae060b4bac2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); @@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net *net, #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_addrpair == (__cookie))&& \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) 
|| \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \
[PATCH net-next v3 8/9] ipv6: handling of multicast packets received in VRF
If the skb for multicast packets marked as enslaved to a VRF are received, then the secondary device index should be used to obtain the real device. And verify the multicast address against the enslaved rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 35 --- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..df58e1100226 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(>daddr) && - !ipv6_chk_mcast_addr(skb->dev, >daddr, - >saddr) && + !ipv6_chk_mcast_addr(dev, >daddr, +>saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* -- 2.11.0
[PATCH net-next v3 0/9] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. The functionality applies to UDP & TCP services, for IPv4 and IPv6, in particular adding VRF table handling for IPv6 multicast. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 v1: - Address Paolo Abeni's comments (patch 4/5) - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5) v2: - Address David Ahern's comments (patches 4/5 and 5/5) - Remove patches 3/5 and 5/5 from series for individual submissions - Include a sysctl for raw sockets as recommended by David Ahern - Expand series into 10 patches and provide improved descriptions v3: - Update description for patch 1/10 and remove patch 6/10 Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Duncan Eastoe (1): net: fix raw socket lookup device bind matching with VRFs Mike Manning (6): net: ensure unbound stream socket to be chosen when not in a VRF net: ensure unbound datagram socket to be chosen when not in a VRF net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs vrf: mark skb for multicast or 
link-local as enslaved to VRF ipv6: allow ping to link-local address in VRF ipv6: handling of multicast packets received in VRF Robert Shearman (1): net: allow binding socket in a VRF when there's an unbound socket Documentation/networking/ip-sysctl.txt | 9 + Documentation/networking/vrf.txt | 17 ++--- drivers/net/vrf.c | 19 +- include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h | 24 --- include/net/inet_sock.h| 21 include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 12 include/net/udp.h | 11 +++ net/core/sock.c| 2 ++ net/ipv4/inet_connection_sock.c| 13 ++--- net/ipv4/inet_hashtables.c | 34 - net/ipv4/raw.c | 3 +-- net/ipv4/sysctl_net_ipv4.c | 11 +++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c| 5 - net/ipv6/inet6_hashtables.c| 14 ++ net/ipv6/ip6_input.c | 35 +++--- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/raw.c | 5 ++--- net/ipv6/udp.c | 22 ++--- 21 files changed, 200 insertions(+), 82 deletions(-) -- 2.11.0
[PATCH net-next v3 7/9] ipv6: allow ping to link-local address in VRF
If link-local packets are marked as enslaved to a VRF, then to allow ping to the link-local from a vrf, the error handling for IPV6_PKTINFO needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a slave device to the vrf. Note that the real device also needs to be retrieved in icmp6_iif() to set the ipv6 flow oif to this for icmp echo reply handling. The recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address with VRF") takes care of this, so the sdif does not need checking here. This fix makes ping to link-local consistent with that to global addresses, in that this can now be done from within the same VRF that the address is in. Signed-off-by: Mike Manning --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c0cac9cc3a28..f3e99e578843 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next v3 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF
The skb for packets that are multicast or to a link-local address are not marked as being enslaved to a VRF, if they are received on a socket bound to the VRF. This is needed for ND and it is preferable for the kernel not to have to deal with the additional use-cases if ll or mcast packets are handled as enslaved. However, this does not allow service instances listening on unbound and bound to VRF sockets to distinguish the VRF used, if packets are sent as multicast or to a link-local address. The fix is for the VRF driver to also mark these skb as being enslaved to the VRF. Signed-off-by: Mike Manning --- drivers/net/vrf.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 69b7227c637e..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again. 
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; -- 2.11.0
Re: [PATCH net] ipv6: revert degradation in IPv6 Ready Logo test results
On 02/10/2018 19:26, David Miller wrote: > From: Mike Manning > Date: Tue, 2 Oct 2018 12:40:30 +0100 > >> This reverts commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags >> smaller than min mtu"). While one should not get fragments smaller than >> the IPv6 minimum MTU, not handling crafted packets in the TAHI IPv6 >> conformance test suite (v6eval) for IPv6 Ready Logo results in 18 >> failures representing over 5% of the score. >> >> Cc: Florian Westphal >> Signed-off-by: Mike Manning > Sorry, I'm not just going to blindly apply a patch because some > TAHI tests fail. > > It's possible the TAHI tests are wrong, or that the specification > elements it is testing don't make any sense these days. > > Allowing all kinds of random junk in the middle of the fragment queue > leads to lots of unnecessary cpu overhead and potential bugs, and it is > triggerable remotely. Understood, thank you. It would be great if there is someone on this mailing list who has influence with ipv6ready.org so as to get the TAHI tests for IPv6 conformance updated, as an upgrade to a kernel with the commit mentioned will result in a 5% degradation in results for the existing tests.
[PATCH net] ipv6: revert degradation in IPv6 Ready Logo test results
This reverts commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu"). While one should not get fragments smaller than the IPv6 minimum MTU, not handling crafted packets in the TAHI IPv6 conformance test suite (v6eval) for IPv6 Ready Logo results in 18 failures representing over 5% of the score. Cc: Florian Westphal Signed-off-by: Mike Manning --- The failures which are reverted by this fix are: Section 1: RFC 2460 - IPv6 Specification Test v6LC.1.2.4: Extension Header Processing Order 33-34 Test v6LC.1.3.1: Fragment Reassembly 67-72 Test v6LC.1.3.2: Reassembly Time Exceeded 73-76 Test v6LC.1.3.3: Fragment Header M-Bit Set, Payload Length Invalid 78 Section 5: RFC 4443 - ICMPv6 Test v6LC.5.1.6: Erroneous Header Field (Parameter Problem Generation) 20 Erroneous Header Field (Parameter Problem Generation) Test v6LC.5.1.10: Error Condition With Multicast Destination 31 Part B: Echo Request Reassembly Timeout Test v6LC.5.1.11: Error Condition With Non-Unique Source - Unspecified 35 Part C: Echo Request Reassembly Timeout (Routers and Hosts) Test v6LC.5.1.12: Error Condition With Non-Unique Source - Multicast 40 Part C: Echo Request Reassembly Timeout (Routers and Hosts) Test v6LC.5.1.13: Error Condition With Non-Unique Source Anycast (Routers Only) 45 Part C: Echo Request Reassembly Timeout net/ipv6/netfilter/nf_conntrack_reasm.c | 4 net/ipv6/reassembly.c | 4 2 files changed, 8 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 8f68a518d9db..8c69c4fc78d8 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -559,10 +559,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - return -EINVAL; - skb_orphan(skb); fq = fq_find(net, fhdr->identification, 
user, hdr, skb->dev ? skb->dev->ifindex : 0); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 5c5b4f79296e..b4e558ab39fa 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -456,10 +456,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - goto fail_hdr; - iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { -- 2.11.0
Re: [PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket
On 25/09/2018 18:16, David Ahern wrote: > On 9/25/18 9:26 AM, Mike Manning wrote: >> On 24/09/2018 23:44, David Ahern wrote: >>> On 9/24/18 10:13 AM, Mike Manning wrote: >>>> From: Robert Shearman >>>> >>>> There is no easy way currently for applications that want to receive >>>> packets in the default VRF to be isolated from packets arriving in >>>> VRFs, which makes using VRF-unaware applications in a VRF-aware system >>>> a potential security risk. >>> That comment is not correct. >>> >>> The point of the l3mdev sysctl's is to prohibit this case. Setting >>> net.ipv4.{tcp,udp}_l3mdev_accept=0 means that a packet arriving on an >>> interface enslaved to a VRF can not be received by a global socket. >> Hi David, thanks for reviewing this. The converse does not hold though, >> i.e. there is no guarantee that the unbound socket will be selected for >> packets when not in a VRF, if there is an unbound socket and a socket >> bound to a VRF. Also, such packets should not be handled by the socket > I need an explicit example here. You are saying a packet arriving on an > interface not enslaved to a VRF might match a socket bound to a VRF? This problem occurs when different service instances are listening on an unbound socket and sockets bound to VRFs respectively. Received packets that are not in a VRF are not guaranteed to be handled by the unbound socket. >> in the VRF if there is no unbound socket. We also had an issue with raw >> socket lookup device bind matching. I can break this particular patch >> into smaller patches and provide more detail, would this help? I will >> also update/break up the other patches according to your comments. > Why not add an l3mdev sysctl for raw sockets then? I have now added this, see patch 4/10. > Yes, please send smaller patches. A diff stat of: > 15 files changed, 109 insertions(+), 62 deletions(-) > is a bit harsh. 
I have removed the 2 patches you are ok with and have submitted them separately , and have split the remaining 3 into 10 smaller patches. >>> Setting the l3mdev to 1 allows the default socket to work across VRFs. >>> If that is not what you want for a given app or a given VRF, then one >>> option is to add netfilter rules on the VRF device to prohibit it. I >>> just verified this works for both tcp and udp. >> Netfilter is per application and so does not scale. I have not checked >> if it is suitable for packet handling on raw sockets. >> >>> Further, overlapping binds are allowed using SO_REUSEPORT meaning I can >>> have a server running in the default vrf bound to a port AND a server >>> running bound to a specific vrf and the same port: >>> >>> udp   UNCONN 0 0 *%red:12345 *:* >>>    users:(("vrf-test",pid=1376,fd=3)) >>> udp   UNCONN 0 0  *:12345 *:* >>> users:(("vrf-test",pid=1375,fd=3)) >>> >>> tcp   LISTEN 0 1 *%red:12345 *:* >>>    users:(("vrf-test",pid=1356,fd=3)) >>> tcp   LISTEN 0 1  *:12345 *:* >>> users:(("vrf-test",pid=1352,fd=3)) >>> >>> For packets arriving on an interface enslaved to a VRF the socket lookup >>> will pick the VRF server over the global one. >> Agreed, but the converse is not guaranteed to hold i.e. packets that are >> not in a VRF may be handled by a socket bound to a VRF. >> >> We do use SO_REUSEPORT for our own applications so as to run instances >> in the default and other VRFs, but still require these patches (or >> similar) due to how packets are handled when there is an unbound socket >> and sockets bound to different VRFs. > Why can't compute_score be adjusted to account for that case? Yes, this is what we are doing. This is now in patch 2/10 for stream and 3/10 for datagram sockets, and see 5/10 for raw sockets. >>> -- >>> >>> With this patch set I am seeing a number of tests failing -- socket >>> connections working when they should not or not working when they >>> should. I only skimmed the results. 
I am guessing this patch is the >>> reason, but that is just a guess. >>> >>> You need to make sure all permutations of: >>> 1. net.ipv4.{tcp,udp}_l3mdev_accept={0,1}, >>> 2. connection in the default VRF and in a VRF, >>> 3. locally originated and remote traffic, >>> 4. ipv4 and ipv6 >>> >> We are using raw, datagram and stream sockets for ipv4 & ipv6
[PATCH net-next v2 03/10] net: ensure unbound datagram socket to be chosen when not in a VRF
Ensure an unbound datagram skt is chosen when not in a VRF. The check for a device match in compute_score() for UDP must also be performed when the skt has no bound device. For this, a failure is returned when there is no device match. This ensures that bound sockets are never selected, even if there is no unbound socket. Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These packets are currently blocked, as flowi6_oif is set to that of the master vrf device, and the ipi6_ifindex is that of the slave device. Allow these packets to be sent by checking that the device with ipi6_ifindex has the same L3 scope as that of the bound device of the skt, which is the master vrf device. Note that this check always succeeds if the skt is unbound. Even though the right datagram skt is now selected by compute_score(), a different skt is being returned that is bound to the wrong vrf. The difference between these and stream sockets is the handling of the skt option for SO_REUSEPORT. While the handling when adding a skt for reuse correctly checks that the bound device of the skt is a match, the skts in the hashslot are already incorrect. So for the same hash, a skt for the wrong vrf may be selected for the required port. The root cause is that the skt is immediately placed into a slot when it is created, but when the skt is then bound using SO_BINDTODEVICE, it remains in the same slot. The solution is to move the skt to the correct slot by forcing a rehash. 
Signed-off-by: Mike Manning --- include/net/udp.h | 11 +++ net/core/sock.c | 2 ++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 5 - net/ipv6/udp.c | 14 +- 5 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 8482a990b0bb..1e4fb8feaf50 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk) return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } +static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); diff --git a/net/core/sock.c b/net/core/sock.c index 3730eb855095..da1cbb88a6bf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, lock_sock(sk); sk->sk_bound_dev_if = index; + if (sk->sk_prot->rehash) + sk->sk_prot->rehash(sk); sk_dst_reset(sk); release_sock(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3386b3b0218c..0559a7f4c83a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net, { int score; struct inet_sock *inet; + bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || @@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } + dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, + dif, sdif); + if 
(!dev_match) + return -1; + score += 4; if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 1ede7a16a0be..4813293d4fad 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (src_info->ipi6_ifindex) { if (fl6->flowi6_oif && - src_info->ipi6_ifindex != fl6->flowi6_oif) + src_info->ipi6_ifindex != fl6->flowi6_oif && + (sk->sk_bound_dev_if != fl6->flowi6_oif || +!sk_dev_equal_l3scope( +sk, src_info->ipi6_ifindex))) return -EINVAL; fl6->flowi6_oif = src_info->
[PATCH net-next v2 01/10] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman There is no easy way currently for applications that want to receive packets in the default VRF to be isolated from packets arriving in VRFs, which makes using VRF-unaware applications in a VRF-aware system a potential security risk. So change the inet socket lookup to avoid packets arriving on a device enslaved to an l3mdev from matching unbound sockets by removing the wildcard for non sk_bound_dev_if and instead relying on check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev and so match against an unbound socket and not match when the input device is enslaved. Change the socket binding to take the l3mdev into account to allow an unbound socket to not conflict sockets bound to an l3mdev given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 13 ++--- include/net/inet_sock.h | 13 + net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 20 +++- 6 files changed, 51 insertions(+), 22 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. +By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. 
+ TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings. -That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..4ae060b4bac2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum); + const unsigned short snum, int l3mdev); void 
inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); @@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net *net, #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, __sdif) \ (((__sk)->sk_portpair == (__ports)) && \ ((__sk)->sk_addrpair == (__cookie))&& \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ -
[PATCH net-next v2 08/10] ipv6: allow ping to link-local address in VRF
If link-local packets are marked as enslaved to a VRF, then to allow ping to the link-local from a vrf, the error handling for IPV6_PKTINFO needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a slave device to the vrf. Note that the real device also needs to be retrieved in icmp6_iif() to set the ipv6 flow oif to this for icmp echo reply handling. The recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address with VRF") takes care of this, so the sdif does not need checking here. This fix makes ping to link-local consistent with that to global addresses, in that this can now be done from within the same VRF that the address is in. Signed-off-by: Mike Manning --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7dfbc797b130..4ebd395dd3df 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next v2 07/10] vrf: mark skb for multicast or link-local as enslaved to VRF
The skb for packets that are multicast or to a link-local address are not marked as being enslaved to a VRF, if they are received on a socket bound to the VRF. This is needed for ND and it is preferable for the kernel not to have to deal with the additional use-cases if ll or mcast packets are handled as enslaved. However, this does not allow service instances listening on unbound and bound to VRF sockets to distinguish the VRF used, if packets are sent as multicast or to a link-local address. The fix is for the VRF driver to also mark these skb as being enslaved to the VRF. Signed-off-by: Mike Manning --- drivers/net/vrf.c | 19 +-- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 69b7227c637e..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again. 
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; -- 2.11.0
[PATCH net-next v2 04/10] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs
Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept for datagram sockets. Have this default to off as this is what users expect, given that there is no explicit mechanism to set unmodified VRF-unaware application into a default VRF. Signed-off-by: Mike Manning --- Documentation/networking/ip-sysctl.txt | 9 + Documentation/networking/vrf.txt | 8 +--- include/net/netns/ipv4.h | 3 +++ net/ipv4/sysctl_net_ipv4.c | 11 +++ 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8313a636dd53..a46be4a5b7a0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -788,6 +788,15 @@ udp_wmem_min - INTEGER total pages of UDP sockets exceed udp_mem pressure. The unit is byte. Default: 4K +RAW variables: + +raw_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index d4b129402d57..deb798342f1e 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -108,11 +108,13 @@ limited to the default VRF. That is, it will not be matched by packets arriving on interfaces enslaved to an l3mdev and processes may bind to the same port if they bind to an l3mdev. 
-TCP & UDP services running in the default VRF context (ie., not bound -to any VRF device) can work across all VRF domains by enabling the -tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: +TCP & UDP services & services using RAW sockets that are running in the +default VRF context (ie., not bound to any VRF device) can work across +all VRF domains by enabling the tcp_l3mdev_accept, udp_l3mdev_accept and +raw_l3mdev_accept sysctl options: sysctl -w net.ipv4.tcp_l3mdev_accept=1 sysctl -w net.ipv4.udp_l3mdev_accept=1 +sysctl -w net.ipv4.raw_l3mdev_accept=1 netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e47503b4e4d1..104a6669e344 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,9 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? */ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_raw_l3mdev_accept; +#endif int sysctl_tcp_early_demux; int sysctl_udp_early_demux; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b92f422f2fa8..d173337040ee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -601,6 +601,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "raw_l3mdev_accept", + .data = _net.ipv4.sysctl_raw_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = , + .extra2 = , + }, +#endif { .procname = "tcp_ecn", .data = _net.ipv4.sysctl_tcp_ecn, -- 2.11.0
[PATCH net-next v2 10/10] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/udp.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6722490c87b9..821fdc31dbc0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(>sk_v6_daddr) && !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(>sk_v6_rcv_saddr) && !ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, >head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next v2 05/10] net: fix raw socket lookup device bind matching with VRFs
From: Duncan Eastoe When there exist a pair of raw sockets one unbound and one bound to a VRF but equal in all other respects, when a packet is received in the VRF context, __raw_v4_lookup() matches on both sockets. This results in the packet being delivered over both sockets, instead of only the raw socket bound to the VRF. The bound device checks in __raw_v4_lookup() are replaced with a call to raw_sk_bound_dev_eq() which correctly handles whether the packet should be delivered over the unbound socket in such cases. In __raw_v6_lookup() the match on the device binding of the socket is similarly updated to use raw_sk_bound_dev_eq() which matches the handling in __raw_v4_lookup(). Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl into account. Signed-off-by: Duncan Eastoe Signed-off-by: Mike Manning --- include/net/raw.h | 12 net/ipv4/raw.c| 4 ++-- net/ipv6/raw.c| 6 +++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 9c9fa98a91a4..ce88fdd68933 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -18,6 +18,7 @@ #define _RAW_H +#include #include #include @@ -74,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk) return (struct raw_sock *)sk; } +static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept, +bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + #endif /* _RAW_H */ diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8ca3eb06ba04..6d8006c86dc0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -131,8 +132,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, if (net_eq(sock_net(sk), net) && inet->inet_num == num && !(inet->inet_daddr && inet->inet_daddr != raddr)&& 
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) goto found; /* gotcha */ } sk = NULL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 413d98bf24f4..5b363d315b06 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6_MIP6) #include @@ -86,9 +87,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) continue; - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif) + if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif)) continue; if (!ipv6_addr_any(>sk_v6_rcv_saddr)) { -- 2.11.0
[PATCH net-next v2 06/10] net: IP[V6]_MULTICAST_IF constraint on unbound socket if VRFs present
From: Duncan Eastoe If setsockopt(IP_MULTICAST_IF) or setsockopt(IPV6_MULTICAST_IF) is called on a socket which is not bound to a VRF then we should ensure that the output device chosen is also not bound to a VRF master. This avoids inadvertently sending traffic out of the wrong interface. This can be particularly problematic for IP_MULTICAST_IF since the interface lookup can be performed by address as well as ifindex. If there are interfaces with the same address, one unbound and one bound to a VRF, then the interface bound to the VRF may be chosen when the sockopt is called on an unbound socket. Signed-off-by: Duncan Eastoe Signed-off-by: Mike Manning --- net/ipv4/ip_sockglue.c | 3 +++ net/ipv6/ipv6_sockglue.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c0fe5ad996f2..026971314c43 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -892,6 +892,9 @@ static int do_ip_setsockopt(struct sock *sk, int level, dev_put(dev); err = -EINVAL; + if (!sk->sk_bound_dev_if && midx) + break; + if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && (!midx || midx != sk->sk_bound_dev_if)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c0cac9cc3a28..7dfbc797b130 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -626,6 +626,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, rcu_read_unlock(); + if (!sk->sk_bound_dev_if && midx) + goto e_inval; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val && (!midx || midx != sk->sk_bound_dev_if)) -- 2.11.0
[PATCH net-next v2 09/10] ipv6: handling of multicast packets received in VRF
If the skb for multicast packets marked as enslaved to a VRF are received, then the secondary device index should be used to obtain the real device. And verify the multicast address against the enslaved rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 35 --- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..df58e1100226 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk } } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + int sdif = inet6_sdif(skb); + struct net_device *dev; /* Only do this once for first final protocol */ have_final = true; @@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); + + /* skb->dev passed may be master dev for vrfs. */ + if (sdif) { + dev = dev_get_by_index_rcu(net, sdif); + if (!dev) + goto discard; + } else { + dev = skb->dev; + } + if (ipv6_addr_is_multicast(>daddr) && - !ipv6_chk_mcast_addr(skb->dev, >daddr, - >saddr) && + !ipv6_chk_mcast_addr(dev, >daddr, +>saddr) && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } @@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* -- 2.11.0
[PATCH net-next v2 00/10] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. The functionality applies to UDP & TCP services, for IPv4 and IPv6, in particular adding VRF table handling for IPv6 multicast. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 v1: - Address Paolo Abeni's comments (patch 4/5) - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5) v2: - Address David Ahern's comments (patches 4/5 and 5/5) - Remove patches 3/5 and 5/5 from series for individual submissions - Include a sysctl for raw sockets as recommended by David Ahern - Expand series into 10 patches and provide improved descriptions Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Duncan Eastoe (2): net: fix raw socket lookup device bind matching with VRFs net: IP[V6]_MULTICAST_IF constraint on unbound socket if VRFs present Mike Manning (6): net: ensure unbound stream socket to be chosen when not in a VRF net: ensure unbound datagram socket to be chosen when not in a VRF net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs vrf: mark skb for multicast or 
link-local as enslaved to VRF ipv6: allow ping to link-local address in VRF ipv6: handling of multicast packets received in VRF Robert Shearman (1): net: allow binding socket in a VRF when there's an unbound socket Documentation/networking/ip-sysctl.txt | 9 + Documentation/networking/vrf.txt | 17 ++--- drivers/net/vrf.c | 19 +- include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h | 24 --- include/net/inet_sock.h| 21 include/net/netns/ipv4.h | 3 +++ include/net/raw.h | 12 include/net/udp.h | 11 +++ net/core/sock.c| 2 ++ net/ipv4/inet_connection_sock.c| 13 ++--- net/ipv4/inet_hashtables.c | 34 - net/ipv4/ip_sockglue.c | 3 +++ net/ipv4/raw.c | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 11 +++ net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c| 5 - net/ipv6/inet6_hashtables.c| 14 ++ net/ipv6/ip6_input.c | 35 +++--- net/ipv6/ipv6_sockglue.c | 5 - net/ipv6/raw.c | 6 +++--- net/ipv6/udp.c | 22 ++--- 22 files changed, 208 insertions(+), 82 deletions(-) -- 2.11.0
[PATCH net-next v2 02/10] net: ensure unbound stream socket to be chosen when not in a VRF
The commit a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") only ensures that the correct socket is selected for packets in a VRF. However, there is no guarantee that the unbound socket will be selected for packets when not in a VRF. By checking for a device match in compute_score() also for the case when there is no bound device and attaching a score to this, the unbound socket is selected. And if a failure is returned when there is no device match, this ensures that bound sockets are never selected, even if there is no unbound socket. Signed-off-by: Mike Manning --- include/net/inet_hashtables.h | 11 +++ include/net/inet_sock.h | 8 net/ipv4/inet_hashtables.c| 14 ++ net/ipv6/inet6_hashtables.c | 14 ++ 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4ae060b4bac2..5de2d9f24c05 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, + int dif, int sdif) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept, + bound_dev_if, dif, sdif); +#else + return inet_bound_dev_eq(1, bound_dev_if, dif, sdif); +#endif +} + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 92e0aa3958f6..47c03ea989ad 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock *sk) return 0; } +static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, +int dif, int sdif) +{ + if (!bound_dev_if) + return !sdif || l3mdev_accept; + return bound_dev_if == dif || 
bound_dev_if == sdif; +} + static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq) { return rcu_dereference_check(ireq->ireq_opt, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 260531dc6458..2ec684057ebd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net *net, { int score = -1; struct inet_sock *inet = inet_sk(sk); + bool dev_match; if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { @@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score += 4; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) + return -1; + score += 4; - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3d7c7460a0c5..5eeeba7181a1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; + bool dev_match; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { @@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct net *net, return -1; score++; } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); + dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, +dif, sdif); + if (!dev_match) +
[PATCH net-next] ipv6: add vrf table handling code for ipv6 mcast
From: Patrick Ruddy The code to obtain the correct table for the incoming interface was missing for IPv6. This has been added along with the table creation notification to fib rules for the RTNL_FAMILY_IP6MR address family. Signed-off-by: Patrick Ruddy Signed-off-by: Mike Manning --- drivers/net/vrf.c | 11 +++ net/ipv6/ip6mr.c | 48 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f93547f257fb..69b7227c637e 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1215,8 +1215,19 @@ static int vrf_add_fib_rules(const struct net_device *dev) goto ipmr_err; #endif +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) + err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); + if (err < 0) + goto ip6mr_err; +#endif + return 0; +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) +ip6mr_err: + vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); +#endif + #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d0b7e0249c13..6f07b8380425 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, @@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(flp6)); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, ); if (err < 0) @@ -164,7 +168,9 @@ static int 
ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ip6mr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(net, mrt, skb, c); + ip6_mr_forward(net, mrt, skb->dev, skb, c); } } @@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; @@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(>_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c) + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) { int psend = -1; int vif, ct; - int true_vifi = ip6mr_find_vif(mrt, skb->dev); + int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; @@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2154,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb)
[PATCH net-next] ipv4: Allow sending multicast packets on specific i/f using VRF socket
From: Robert Shearman It is useful to be able to use the same socket for listening in a specific VRF, as for sending multicast packets out of a specific interface. However, the bound device on the socket currently takes precedence and results in the packets not being sent. Relax the condition on overriding the output interface to use for sending packets out of UDP, raw and ping sockets to allow multicast packets to be sent using the specified multicast interface. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- net/ipv4/datagram.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 33df4d76db2d..8ca3eb06ba04 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/udp.c 
b/net/ipv4/udp.c index f4e35b2ff8b8..3386b3b0218c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; -- 2.11.0
Re: [PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket
On 24/09/2018 23:44, David Ahern wrote: On 9/24/18 10:13 AM, Mike Manning wrote: From: Robert Shearman There is no easy way currently for applications that want to receive packets in the default VRF to be isolated from packets arriving in VRFs, which makes using VRF-unaware applications in a VRF-aware system a potential security risk. That comment is not correct. The point of the l3mdev sysctl's is to prohibit this case. Setting net.ipv4.{tcp,udp}_l3mdev_accept=0 means that a packet arriving on an interface enslaved to a VRF can not be received by a global socket. Hi David, thanks for reviewing this. The converse does not hold though, i.e. there is no guarantee that the unbound socket will be selected for packets when not in a VRF, if there is an unbound socket and a socket bound to a VRF. Also, such packets should not be handled by the socket in the VRF if there is no unbound socket. We also had an issue with raw socket lookup device bind matching. I can break this particular patch into smaller patches and provide more detail, would this help? I will also update/break up the other patches according to your comments. Setting the l3mdev to 1 allows the default socket to work across VRFs. If that is not what you want for a given app or a given VRF, then one option is to add netfilter rules on the VRF device to prohibit it. I just verified this works for both tcp and udp. Netfilter is per application and so does not scale. I have not checked if it is suitable for packet handling on raw sockets. 
Further, overlapping binds are allowed using SO_REUSEPORT meaning I can have a server running in the default vrf bound to a port AND a server running bound to a specific vrf and the same port: udp UNCONN 0 0 *%red:12345 *:* users:(("vrf-test",pid=1376,fd=3)) udp UNCONN 0 0 *:12345 *:* users:(("vrf-test",pid=1375,fd=3)) tcp LISTEN 0 1 *%red:12345 *:* users:(("vrf-test",pid=1356,fd=3)) tcp LISTEN 0 1 *:12345 *:* users:(("vrf-test",pid=1352,fd=3)) For packets arriving on an interface enslaved to a VRF the socket lookup will pick the VRF server over the global one. Agreed, but the converse is not guaranteed to hold i.e. packets that are not in a VRF may be handled by a socket bound to a VRF. We do use SO_REUSEPORT for our own applications so as to run instances in the default and other VRFs, but still require these patches (or similar) due to how packets are handled when there is an unbound socket and sockets bound to different VRFs. -- With this patch set I am seeing a number of tests failing -- socket connections working when they should not or not working when they should. I only skimmed the results. I am guessing this patch is the reason, but that is just a guess. You need to make sure all permutations of: 1. net.ipv4.{tcp,udp}_l3mdev_accept={0,1}, 2. connection in the default VRF and in a VRF, 3. locally originated and remote traffic, 4. ipv4 and ipv6 We are using raw, datagram and stream sockets for ipv4 & ipv6, require connectivity for local and remote addresses where appropriate and need route leaking between VRFs when configured, we are unaware of any outstanding bugs. Is there some way that I can run/analyze the tests that are failing for you? Also cf patch 2/5 note that ping to link-local addresses is handled consistently with that to global addresses in a VRF, so this now succeeds if ping is done in the VRF, i.e. 'sudo ip vrf exec <vrf> ping -I <intf> <link-local-addr>'. [...] continue to work as expected meaning packets flow when they should and fail with the right error when they should not.
I believe the UDP cases were the main ones failing. Given the test failures, I did not look at the code changes in the patch.
[PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman There is no easy way currently for applications that want to receive packets in the default VRF to be isolated from packets arriving in VRFs, which makes using VRF-unaware applications in a VRF-aware system a potential security risk. So change the inet socket lookup to avoid packets arriving on a device enslaved to an l3mdev from matching unbound sockets by removing the wildcard for non sk_bound_dev_if and instead relying on check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev and so match against an unbound socket and not match when the input device is enslaved. The existing net.ipv4.tcp_l3mdev_accept & net.ipv4.udp_l3mdev_accept sysctls, which are documented as allowing the working across all VRF domains, can be used to also work in the default VRF by causing unbound sockets to match against packets arriving on a device enslaved to an l3mdev. Change the socket binding to take the l3mdev into account to allow an unbound socket to not conflict sockets bound to an l3mdev given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 31 --- include/net/inet_sock.h | 13 + net/core/sock.c | 2 ++ net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 34 +- net/ipv4/ip_sockglue.c | 3 +++ net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 5 - net/ipv6/inet6_hashtables.c | 14 ++ net/ipv6/ipv6_sockglue.c | 3 +++ net/ipv6/raw.c | 6 +++--- net/ipv6/udp.c | 14 +- 15 files changed, 109 insertions(+), 62 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. 
+By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings. -That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..866efd35ded4 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -188,10 +189,28 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +#ifdef CONFIG_NET_L3_MASTER_DEV +static inline bool inet_sk_bound_dev_eq(struct ne
[PATCH net-next v1 0/5] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. The functionality applies to UDP & TCP services, for IPv4 and IPv6, in particular adding VRF table handling for IPv6 multicast. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 v1: - Address Paolo Abeni's comments (patch 4/5) - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5) Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Mike Manning (1): ipv6: allow link-local and multicast packets inside vrf Patrick Ruddy (1): ipv6: add vrf table handling code for ipv6 mcast Robert Shearman (2): net: allow binding socket in a VRF when there's an unbound socket ipv4: Allow sending multicast packets on specific i/f using VRF socket Documentation/networking/vrf.txt | 9 drivers/net/vrf.c| 30 include/net/inet6_hashtables.h | 5 ++-- include/net/inet_hashtables.h| 31 +++-- include/net/inet_sock.h | 13 +++ net/core/sock.c | 2 ++ net/ipv4/datagram.c | 2 +- net/ipv4/inet_connection_sock.c | 13 --- net/ipv4/inet_hashtables.c | 34 +--- net/ipv4/ip_sockglue.c | 3 +++ net/ipv4/ping.c | 2 +- 
net/ipv4/raw.c | 6 ++--- net/ipv4/udp.c | 17 ++ net/ipv6/datagram.c | 5 +++- net/ipv6/inet6_hashtables.c | 14 +--- net/ipv6/ip6_input.c | 43 +++ net/ipv6/ip6mr.c | 49 ++-- net/ipv6/ipv6_sockglue.c | 5 +++- net/ipv6/raw.c | 6 ++--- net/ipv6/udp.c | 22 -- 20 files changed, 214 insertions(+), 97 deletions(-) -- 2.11.0
[PATCH net-next v1 3/5] ipv4: Allow sending multicast packets on specific i/f using VRF socket
From: Robert Shearman It is useful to be able to use the same socket for listening in a specific VRF, as for sending multicast packets out of a specific interface. However, the bound device on the socket currently takes precedence and results in the packets not being sent. Relax the condition on overriding the output interface to use for sending packets out of UDP, raw and ping sockets to allow multicast packets to be sent using the specified multicast interface. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- net/ipv4/datagram.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8a0d568d7aec..c55ef53d87a8 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/udp.c 
b/net/ipv4/udp.c index 3d59ab47a85d..f81097843031 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1039,7 +1039,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; -- 2.11.0
[PATCH net-next v1 2/5] ipv6: allow link-local and multicast packets inside vrf
Packets that are multicast or to link-local addresses are not enslaved to the vrf of the socket that they are received on. This is needed for NDISC, but breaks applications that rely on receiving such packets when in a VRF. Also to make IPv6 consistent with IPv4 which does handle multicast packets as being enslaved, modify the VRF driver to do the same for IPv6. As a result, the multicast address check needs to verify the address against the enslaved rather than the l3mdev device. Signed-off-by: Mike Manning --- drivers/net/vrf.c| 19 +-- net/ipv6/ip6_input.c | 19 ++- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f93547f257fb..9d817c19f3b4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again. 
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..108f5f88ec98 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -432,15 +432,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7dfbc797b130..4ebd395dd3df 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next v1 4/5] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Verify the multicast address against the enslaved rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 24 net/ipv6/udp.c | 8 +--- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 108f5f88ec98..82ffb5cdd2ab 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -324,11 +324,14 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct inet6_protocol *ipprot; + int sdif = inet6_sdif(skb); + bool have_final = false; struct inet6_dev *idev; + struct net_device *dev; unsigned int nhoff; + bool deliver; int nexthdr; bool raw; - bool have_final = false; /* * Parse extension headers @@ -371,9 +374,22 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); - if (ipv6_addr_is_multicast(&hdr->daddr) && - !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, - &hdr->saddr) && + + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + dev = dev_get_by_index_rcu(dev_net(skb->dev), + sdif); + if (!dev) { + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + + deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, + &hdr->saddr); + if (ipv6_addr_is_multicast(&hdr->daddr) && !deliver && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e22b7dd78c9b..35f71b7a1070 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next v1 5/5] ipv6: add vrf table handling code for ipv6 mcast
From: Patrick Ruddy The code to obtain the correct table for the incoming interface was missing for IPv6. This has been added along with the table creation notification to fib rules for the RTNL_FAMILY_IP6MR address family. Signed-off-by: Patrick Ruddy Signed-off-by: Mike Manning --- drivers/net/vrf.c | 11 +++ net/ipv6/ip6mr.c | 49 + 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9d817c19f3b4..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1214,8 +1214,19 @@ static int vrf_add_fib_rules(const struct net_device *dev) goto ipmr_err; #endif +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) + err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); + if (err < 0) + goto ip6mr_err; +#endif + return 0; +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) +ip6mr_err: + vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); +#endif + #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d0b7e0249c13..1ecc88456dc5 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, @@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(flp6)); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, ); if (err < 0) @@ -164,7 +168,9 @@ static int 
ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ip6mr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(net, mrt, skb, c); + ip6_mr_forward(net, mrt, skb->dev, skb, c); } } @@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; @@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(>_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c) + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) { int psend = -1; int vif, ct; - int true_vifi = ip6mr_find_vif(mrt, skb->dev); + int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; @@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2146,6 +2157,7 @@ static void ip6_mr_forward(struct net *n
Re: [PATCH net-next 4/5] ipv6: do not drop vrf udp multicast packets
On 20/09/2018 14:02, Paolo Abeni wrote: > Hi, > > On Thu, 2018-09-20 at 09:58 +0100, Mike Manning wrote: >> diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c >> index 108f5f88ec98..fc60f297d95b 100644 >> --- a/net/ipv6/ip6_input.c >> +++ b/net/ipv6/ip6_input.c >> @@ -325,9 +325,12 @@ static int ip6_input_finish(struct net *net, struct >> sock *sk, struct sk_buff *sk >> { >> const struct inet6_protocol *ipprot; >> struct inet6_dev *idev; >> +struct net_device *dev; >> unsigned int nhoff; >> +int sdif = inet6_sdif(skb); >> int nexthdr; >> bool raw; >> +bool deliver; >> bool have_final = false; > Please, try instead to sort the variables in reverse x-mas tree order. Will do. >> >> /* >> @@ -371,9 +374,27 @@ static int ip6_input_finish(struct net *net, struct >> sock *sk, struct sk_buff *sk >> skb_postpull_rcsum(skb, skb_network_header(skb), >> skb_network_header_len(skb)); >> hdr = ipv6_hdr(skb); >> -if (ipv6_addr_is_multicast(&hdr->daddr) && >> -!ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, >> -&hdr->saddr) && >> + >> +/* skb->dev passed may be master dev for vrfs. */ >> +if (sdif) { >> +rcu_read_lock(); > AFAICS, the rcu lock is already acquired at the beginning of > ip6_input_finish(), no need to acquire it here again. Nice catch, I will remove this. > + dev = dev_get_by_index_rcu(dev_net(skb->dev), >> + sdif); >> +if (!dev) { >> +rcu_read_unlock(); >> +kfree_skb(skb); >> +return -ENODEV; >> +} >> +} else { >> +dev = skb->dev; > The above fragment of code is a recurring pattern in this series, > perhaps adding a helper for it would reduce code duplication ? This pattern of checking the secondary device index is used only twice, both in this file. But with now one instance having the rcu lock handling, and the other not, I cannot refactor this. > > Cheers, > > Paolo > Thanks for the review! I will wait for further comments before producing a v1 of the series. Regards, Mike
[PATCH net-next 0/5] vrf: allow simultaneous service instances in default and other VRFs
Services currently have to be VRF-aware if they are using an unbound socket. One cannot have multiple service instances running in the default and other VRFs for services that are not VRF-aware and listen on an unbound socket. This is because there is no way of isolating packets received in the default VRF from those arriving in other VRFs. This series provides this isolation subject to the existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is documented as allowing a single service instance to work across all VRF domains. The functionality applies to UDP & TCP services, for IPv4 and IPv6, in particular adding VRF table handling for IPv6 multicast. Example of running ssh instances in default and blue VRF: $ /usr/sbin/sshd -D $ ip vrf exec vrf-blue /usr/sbin/sshd $ ss -ta | egrep 'State|ssh' State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 128 0.0.0.0%vrf-blue:ssh 0.0.0.0:* LISTEN 0 128 0.0.0.0:ssh 0.0.0.0:* ESTAB 0 0 192.168.122.220:ssh 192.168.122.1:50282 LISTEN 0 128 [::]%vrf-blue:ssh [::]:* LISTEN 0 128 [::]:ssh [::]:* ESTAB 0 0 [3000::2]%vrf-blue:ssh [3000::9]:45896 ESTAB 0 0 [2000::2]:ssh [2000::9]:46398 Dewi Morgan (1): ipv6: do not drop vrf udp multicast packets Mike Manning (1): ipv6: allow link-local and multicast packets inside vrf Patrick Ruddy (1): ipv6: add vrf table handling code for ipv6 mcast Robert Shearman (2): net: allow binding socket in a VRF when there's an unbound socket ipv4: Allow sending multicast packets on specific i/f using VRF socket Documentation/networking/vrf.txt | 9 drivers/net/vrf.c| 30 include/net/inet6_hashtables.h | 5 ++-- include/net/inet_hashtables.h| 21 +++-- include/net/inet_sock.h | 13 +++ net/core/sock.c | 2 ++ net/ipv4/datagram.c | 2 +- net/ipv4/inet_connection_sock.c | 13 --- net/ipv4/inet_hashtables.c | 34 +--- net/ipv4/ip_sockglue.c | 3 +++ net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 6 ++--- net/ipv4/udp.c | 17 ++ net/ipv6/datagram.c | 5 +++- net/ipv6/inet6_hashtables.c | 14 +--- 
net/ipv6/ip6_input.c | 46 + net/ipv6/ip6mr.c | 49 ++-- net/ipv6/ipv6_sockglue.c | 5 +++- net/ipv6/raw.c | 6 ++--- net/ipv6/udp.c | 22 -- 20 files changed, 208 insertions(+), 96 deletions(-) -- 2.11.0
[PATCH net-next 2/5] ipv6: allow link-local and multicast packets inside vrf
Packets that are multicast or to link-local addresses are not enslaved to the vrf of the socket that they are received on. This is needed for NDISC, but breaks applications that rely on receiving such packets when in a VRF. Also to make IPv6 consistent with IPv4 which does handle multicast packets as being enslaved, modify the VRF driver to do the same for IPv6. As a result, the multicast address check needs to verify the address against the enslaved rather than the l3mdev device. Signed-off-by: Mike Manning --- drivers/net/vrf.c| 19 +-- net/ipv6/ip6_input.c | 19 ++- net/ipv6/ipv6_sockglue.c | 2 +- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index f93547f257fb..9d817c19f3b4 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; - bool need_strict; + bool need_strict = rt6_need_strict(_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); - /* loopback traffic; do not push through packet taps again. -* Reset pkt_type for upper layers to process skb + /* loopback, multicast & non-ND link-local traffic; do not push through +* packet taps again. 
Reset pkt_type for upper layers to process skb */ - if (skb->pkt_type == PACKET_LOOPBACK) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; - skb->pkt_type = PACKET_HOST; + if (skb->pkt_type == PACKET_LOOPBACK) + skb->pkt_type = PACKET_HOST; goto out; } - /* if packet is NDISC or addressed to multicast or link-local -* then keep the ingress interface -*/ - need_strict = rt6_need_strict(_hdr(skb)->daddr); - if (!ipv6_ndisc_frame(skb) && !need_strict) { + /* if packet is NDISC then keep the ingress interface */ + if (!is_ndisc) { vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 96577e742afd..108f5f88ec98 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -432,15 +432,32 @@ EXPORT_SYMBOL_GPL(ip6_input); int ip6_mc_input(struct sk_buff *skb) { + int sdif = inet6_sdif(skb); const struct ipv6hdr *hdr; + struct net_device *dev; bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + hdr = ipv6_hdr(skb); - deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL); + deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL); + if (sdif) + rcu_read_unlock(); #ifdef CONFIG_IPV6_MROUTE /* diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 7dfbc797b130..4ebd395dd3df 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EFAULT; break; } - if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if) + if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; -- 2.11.0
[PATCH net-next 5/5] ipv6: add vrf table handling code for ipv6 mcast
From: Patrick Ruddy The code to obtain the correct table for the incoming interface was missing for IPv6. This has been added along with the table creation notification to fib rules for the RTNL_FAMILY_IP6MR address family. Signed-off-by: Patrick Ruddy Signed-off-by: Mike Manning --- drivers/net/vrf.c | 11 +++ net/ipv6/ip6mr.c | 49 + 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9d817c19f3b4..21ad4b1d7f03 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1214,8 +1214,19 @@ static int vrf_add_fib_rules(const struct net_device *dev) goto ipmr_err; #endif +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) + err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); + if (err < 0) + goto ip6mr_err; +#endif + return 0; +#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) +ip6mr_err: + vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); +#endif + #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d0b7e0249c13..1ecc88456dc5 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache); + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, @@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(flp6)); + err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, ); if (err < 0) @@ -164,7 +168,9 @@ static int 
ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, return -EINVAL; } - mrt = ip6mr_get_table(rule->fr_net, rule->table); + arg->table = fib_rule_get_table(rule, arg); + + mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else - ip6_mr_forward(net, mrt, skb, c); + ip6_mr_forward(net, mrt, skb->dev, skb, c); } } @@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; @@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(>_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *c) + struct net_device *dev, struct sk_buff *skb, + struct mfc6_cache *c) { int psend = -1; int vif, ct; - int true_vifi = ip6mr_find_vif(mrt, skb->dev); + int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; @@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2146,6 +2157,7 @@ static void ip6_mr_forward(struct net *n
[PATCH net-next 4/5] ipv6: do not drop vrf udp multicast packets
From: Dewi Morgan For bound udp sockets in a vrf, also check the sdif to get the index for ingress devices enslaved to an l3mdev. Verify the multicast address against the enslaved rather than the l3mdev device. Signed-off-by: Dewi Morgan Signed-off-by: Mike Manning --- net/ipv6/ip6_input.c | 27 --- net/ipv6/udp.c | 8 +--- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 108f5f88ec98..fc60f297d95b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -325,9 +325,12 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk { const struct inet6_protocol *ipprot; struct inet6_dev *idev; + struct net_device *dev; unsigned int nhoff; + int sdif = inet6_sdif(skb); int nexthdr; bool raw; + bool deliver; bool have_final = false; /* @@ -371,9 +374,27 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); hdr = ipv6_hdr(skb); - if (ipv6_addr_is_multicast(>daddr) && - !ipv6_chk_mcast_addr(skb->dev, >daddr, - >saddr) && + + /* skb->dev passed may be master dev for vrfs. 
*/ + if (sdif) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(dev_net(skb->dev), + sdif); + if (!dev) { + rcu_read_unlock(); + kfree_skb(skb); + return -ENODEV; + } + } else { + dev = skb->dev; + } + + deliver = ipv6_chk_mcast_addr(dev, >daddr, + >saddr); + if (sdif) + rcu_read_unlock(); + + if (ipv6_addr_is_multicast(>daddr) && !deliver && !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) goto discard; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e22b7dd78c9b..35f71b7a1070 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, const struct in6_addr *loc_addr, __be16 rmt_port, const struct in6_addr *rmt_addr, - int dif, unsigned short hnum) + int dif, int sdif, unsigned short hnum) { struct inet_sock *inet = inet_sk(sk); @@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(>sk_v6_daddr) && !ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) || + !inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || (!ipv6_addr_any(>sk_v6_rcv_saddr) && !ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr))) return false; @@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); int dif = inet6_iif(skb); + int sdif = inet6_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; @@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, sk_for_each_entry_offset_rcu(sk, node, >head, offset) { if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, - uh->source, saddr, dif, hnum)) + uh->source, saddr, dif, sdif, + hnum)) continue; /* If zero checksum and 
no_check is not on for * the socket then skip it. -- 2.11.0
[PATCH net-next 1/5] net: allow binding socket in a VRF when there's an unbound socket
From: Robert Shearman There is no easy way currently for applications that want to receive packets in the default VRF to be isolated from packets arriving in VRFs, which makes using VRF-unaware applications in a VRF-aware system a potential security risk. So change the inet socket lookup to avoid packets arriving on a device enslaved to an l3mdev from matching unbound sockets by removing the wildcard for non sk_bound_dev_if and instead relying on check against the secondary device index, which will be 0 when the input device is not enslaved to an l3mdev and so match against an unbound socket and not match when the input device is enslaved. The existing net.ipv4.tcp_l3mdev_accept & net.ipv4.udp_l3mdev_accept sysctls, which are documented as allowing the working across all VRF domains, can be used to also work in the default VRF by causing unbound sockets to match against packets arriving on a device enslaved to an l3mdev. Change the socket binding to take the l3mdev into account to allow an unbound socket to not conflict sockets bound to an l3mdev given the datapath isolation now guaranteed. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- Documentation/networking/vrf.txt | 9 + include/net/inet6_hashtables.h | 5 ++--- include/net/inet_hashtables.h| 21 ++--- include/net/inet_sock.h | 13 + net/core/sock.c | 2 ++ net/ipv4/inet_connection_sock.c | 13 ++--- net/ipv4/inet_hashtables.c | 34 +- net/ipv4/ip_sockglue.c | 3 +++ net/ipv4/raw.c | 4 ++-- net/ipv4/udp.c | 15 ++- net/ipv6/datagram.c | 5 - net/ipv6/inet6_hashtables.c | 14 ++ net/ipv6/ipv6_sockglue.c | 3 +++ net/ipv6/raw.c | 6 +++--- net/ipv6/udp.c | 14 +- 15 files changed, 99 insertions(+), 62 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 8ff7b4c8f91b..d4b129402d57 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -103,6 +103,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. 
+By default the scope of the port bindings for unbound sockets is +limited to the default VRF. That is, it will not be matched by packets +arriving on interfaces enslaved to an l3mdev and processes may bind to +the same port if they bind to an l3mdev. + TCP & UDP services running in the default VRF context (ie., not bound to any VRF device) can work across all VRF domains by enabling the tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: @@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. -The default VRF does not have limited scope with respect to port bindings. -That is, if a process does a wildcard bind to a port in the default VRF it -owns the port across all VRF domains within the network namespace. - Using iproute2 for VRFs diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 6e91e38a31da..9db98af46985 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk); ((__sk)->sk_family == AF_INET6)&& \ ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr)) && \ ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr)) && \ -(!(__sk)->sk_bound_dev_if || \ - ((__sk)->sk_bound_dev_if == (__dif)) || \ - ((__sk)->sk_bound_dev_if == (__sdif))) && \ +(((__sk)->sk_bound_dev_if == (__dif)) || \ + ((__sk)->sk_bound_dev_if == (__sdif)))&& \ net_eq(sock_net(__sk), (__net))) #endif /* _INET6_HASHTABLES_H */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9141e95529e7..ec279bcd0958 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -79,6 +79,7 @@ struct inet_ehash_bucket { struct inet_bind_bucket { possible_net_t ib_net; + int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; @@ -188,10 +189,18 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if, +
[PATCH net-next 3/5] ipv4: Allow sending multicast packets on specific i/f using VRF socket
From: Robert Shearman It is useful to be able to use the same socket for listening in a specific VRF, as for sending multicast packets out of a specific interface. However, the bound device on the socket currently takes precedence and results in the packets not being sent. Relax the condition on overriding the output interface to use for sending packets out of UDP, raw and ping sockets to allow multicast packets to be sent using the specified multicast interface. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- net/ipv4/datagram.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index f915abff1350..300921417f89 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len oif = sk->sk_bound_dev_if; saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { - if (!oif) + if (!oif || netif_index_is_l3_master(sock_net(sk), oif)) oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8d7aaf118a30..7ccb5f87f70b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8a0d568d7aec..c55ef53d87a8 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) tos |= RTO_ONLINK; if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; diff --git a/net/ipv4/udp.c 
b/net/ipv4/udp.c index 3d59ab47a85d..f81097843031 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1039,7 +1039,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (ipv4_is_multicast(daddr)) { - if (!ipc.oif) + if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = inet->mc_index; if (!saddr) saddr = inet->mc_addr; -- 2.11.0
[PATCH net-next] ipv6: Allow the l3mdev to be a loopback
From: Robert Shearman There is no way currently for an IPv6 client to connect using a loopback address in a VRF, whereas for IPv4 the loopback address can be added: $ sudo ip addr add dev vrfred 127.0.0.1/8 $ sudo ip -6 addr add ::1/128 dev vrfred RTNETLINK answers: Cannot assign requested address So allow ::1 to be configured on an L3 master device. In order for this to be usable ip6_route_output_flags needs to not consider ::1 to be a link scope address (since oif == l3mdev and so it would be dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback device so that it doesn't drop the packets. Signed-off-by: Robert Shearman Signed-off-by: Mike Manning --- net/ipv6/addrconf.c | 1 + net/ipv6/ip6_input.c | 3 ++- net/ipv6/route.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d4733160e6b7..bfe3ec7ecb14 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, if (addr_type == IPV6_ADDR_ANY || addr_type & IPV6_ADDR_MULTICAST || (!(idev->dev->flags & IFF_LOOPBACK) && +!netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6242682be876..96577e742afd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, */ if ((ipv6_addr_loopback(>saddr) || ipv6_addr_loopback(>daddr)) && -!(dev->flags & IFF_LOOPBACK)) + !(dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0fa62acc923c..f36ee8a3314f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, { bool any_src; - if (rt6_need_strict(>daddr)) { + if
(ipv6_addr_type(>daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; dst = l3mdev_link_scope_lookup(net, fl6); -- 2.11.0
[PATCH net-next] ipv6: Allow the l3mdev to be a loopback
There is no way currently for an IPv6 client to connect using a loopback address in a VRF, whereas for IPv4 the loopback address can be added: $ sudo ip addr add dev vrfred 127.0.0.1/8 $ sudo ip -6 addr add ::1/128 dev vrfred RTNETLINK answers: Cannot assign requested address So allow ::1 to be configured on an L3 master device. In order for this to be usable ip6_route_output_flags needs to not consider ::1 to be a link scope address (since oif == l3mdev and so it would be dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback device so that it doesn't drop the packets. Signed-off-by: Robert Shearman --- net/ipv6/addrconf.c | 1 + net/ipv6/ip6_input.c | 3 ++- net/ipv6/route.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d4733160e6b7..bfe3ec7ecb14 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, if (addr_type == IPV6_ADDR_ANY || addr_type & IPV6_ADDR_MULTICAST || (!(idev->dev->flags & IFF_LOOPBACK) && +!netif_is_l3_master(idev->dev) && addr_type & IPV6_ADDR_LOOPBACK)) return ERR_PTR(-EADDRNOTAVAIL); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6242682be876..96577e742afd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, */ if ((ipv6_addr_loopback(>saddr) || ipv6_addr_loopback(>daddr)) && -!(dev->flags & IFF_LOOPBACK)) + !(dev->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev)) goto err; /* RFC4291 Errata ID: 3480 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0fa62acc923c..f36ee8a3314f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, { bool any_src; - if (rt6_need_strict(>daddr)) { + if (ipv6_addr_type(>daddr) & + (IPV6_ADDR_MULTICAST | 
IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; dst = l3mdev_link_scope_lookup(net, fl6); -- 2.11.0
[PATCH] net: allow interface to be set into VRF if VLAN interface in same VRF
Setting an interface into a VRF fails with 'RTNETLINK answers: File exists' if one of its VLAN interfaces is already in the same VRF. As the VRF is an upper device of the VLAN interface, it is also showing up as an upper device of the interface itself. The solution is to restrict this check to devices other than master. As only one master device can be linked to a device, the check in this case is that the upper device (VRF) being linked to is not the same as the master device instead of it not being any one of the upper devices. The following example shows an interface ens12 (with a VLAN interface ens12.10) being set into VRF green, which behaves as expected: # ip link add link ens12 ens12.10 type vlan id 10 # ip link set dev ens12 master vrfgreen # ip link show dev ens12 3: ens12: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel master vrfgreen state UP mode DEFAULT group default qlen 1000 link/ether 52:54:00:4c:a0:45 brd ff:ff:ff:ff:ff:ff But if the VLAN interface has previously been set into the same VRF, then setting the interface into the VRF fails: # ip link set dev ens12 nomaster # ip link set dev ens12.10 master vrfgreen # ip link show dev ens12.10 39: ens12.10@ens12: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue master vrfgreen state UP mode DEFAULT group default qlen 1000 link/ether 52:54:00:4c:a0:45 brd ff:ff:ff:ff:ff:ff # ip link set dev ens12 master vrfgreen RTNETLINK answers: File exists The workaround is to move the VLAN interface back into the default VRF beforehand, but it has to be shut first so as to avoid the risk of traffic leaking from the VRF. This fix avoids needing this workaround. 
Signed-off-by: Mike Manning <mmann...@att.com> --- net/core/dev.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index d4362be..2cedf52 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6396,6 +6396,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, .linking = true, .upper_info = upper_info, }; + struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); @@ -6407,11 +6408,14 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (netdev_has_upper_dev(dev, upper_dev)) - return -EEXIST; - - if (master && netdev_master_upper_dev_get(dev)) - return -EBUSY; + if (!master) { + if (netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, _info.info); -- 2.1.4
[PATCH] net: allow interface to be set into vrf if a vif in same vrf
Setting an interface into a vrf fails with 'RTNETLINK answers: File exists' if one of its vifs is already in the same vrf. As the vrf is an upper device of the vif, it is also showing up as an upper device of the interface itself. The solution is to restrict this check to devices other than master. As only one master device can be linked to a device, in this case the check is whether the upper device (vrf) being linked to is already the master device, rather than whether it is any one of the upper devices. Signed-off-by: Mike Manning <mmann...@att.com> --- net/core/dev.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index d4362be..2cedf52 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6396,6 +6396,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, .linking = true, .upper_info = upper_info, }; + struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); @@ -6407,11 +6408,14 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; - if (netdev_has_upper_dev(dev, upper_dev)) - return -EEXIST; - - if (master && netdev_master_upper_dev_get(dev)) - return -EBUSY; + if (!master) { + if (netdev_has_upper_dev(dev, upper_dev)) + return -EEXIST; + } else { + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + return master_dev == upper_dev ? -EEXIST : -EBUSY; + } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, _info.info); -- 2.1.4
[PATCH net-next] net: ipv6: send NS for DAD when link operationally up
The NS for DAD are sent on admin up as long as a valid qdisc is found. A race condition exists by which these packets will not egress the interface if the operational state of the lower device is not yet up. The solution is to delay DAD until the link is operationally up according to RFC2863. Rather than only doing this, follow the existing code checks by deferring IPv6 device initialization altogether. The fix allows DAD on devices like tunnels that are controlled by userspace control plane. The fix has no impact on regular deployments, but means that there is no IPv6 connectivity until the port has been opened in the case of port-based network access control, which should be desirable. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c2e2a78..dffbf3b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -3393,7 +3393,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is 
not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3408,7 +3408,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. */ break; } -- 2.1.4
Re: [PATCH] net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure
Hi, In the absence of a reply from Mahesh, I would be most grateful if anyone familiar with the IPv6 code could review this 1-line fix. Or if not, then I request that the commit f784ad3d79e5 is backed out, as its intention is to remove the redundant but harmless RTM_DELADDR for addresses in tentative state, but it is also incorrectly removing the very necessary RTM_DELADDR when an address is deleted that was previously notified with an RTM_NEWADDR as being in tentative dadfailed state. Thanks Mike On 08/09/17 03:18, David Miller wrote: > From: Mike Manning <mmann...@brocade.com> > Date: Mon, 4 Sep 2017 15:52:55 +0100 > >> Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative >> addresses") incorrectly assumes that no RTM_NEWADDR are sent for >> addresses in tentative state, as this does happen for the standard >> IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in >> addconf_dad_stop(). So as a result of this change, no RTM_DELADDR is >> sent after DAD failure for a link-local when strict DAD (accept_dad=2) >> is configured, or on the next admin down in other cases. The absence >> of this notification breaks backwards compatibility and causes problems >> after DAD failure if this notification was being relied on. The >> solution is to allow RTM_DELADDR to still be sent after DAD failure. >> >> Fixes: f784ad3d79e5("ipv6: do not send RTM_DELADDR for tentative addresses") >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> Cc: Mahesh Bandewar <mahe...@google.com> > > Mahesh, please review this patch. >
[PATCH] net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure
Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") incorrectly assumes that no RTM_NEWADDR is sent for addresses in tentative state, as this does happen for the standard IPv6 use-case of DAD failure; see the call to ipv6_ifa_notify() in addrconf_dad_stop(). So as a result of this change, no RTM_DELADDR is sent after DAD failure for a link-local address when strict DAD (accept_dad=2) is configured, or on the next admin down in other cases. The absence of this notification breaks backwards compatibility and causes problems after DAD failure if this notification was being relied on. The solution is to allow RTM_DELADDR to still be sent after DAD failure. Fixes: f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative addresses") Signed-off-by: Mike Manning <mmann...@brocade.com> Cc: Mahesh Bandewar <mahe...@google.com> --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 936e9ab..ba757c2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4982,9 +4982,10 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) /* Don't send DELADDR notification for TENTATIVE address, * since NEWADDR notification is sent only after removing -* TENTATIVE flag. +* TENTATIVE flag, if DAD has not failed. */ - if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && + event == RTM_DELADDR) return; skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); -- 2.1.4
Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
On 03/05/17 19:24, Mike Manning wrote: > On 03/05/17 18:58, Cong Wang wrote: >> On Tue, May 2, 2017 at 11:30 AM, Mike Manning <mmann...@brocade.com> wrote: >>> While this is not reproducible manually, Andrey's syzkaller program hit >>> the warning "IPv6: Freeing alive inet6 address" with this part trace: >>> >>> inet6_ifa_finish_destroy+0x12e/0x190 c:894 >>> in6_ifa_put ./include/net/addrconf.h:330 >>> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963 >>> >>> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather >>> than after calling addrconf_ifdown(), as the latter may remove it from >>> the address hash table. >>> >>> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict >>> DAD") >>> Reported-by: Andrey Konovalov <andreyk...@google.com> >>> Signed-off-by: Mike Manning <mmann...@brocade.com> >>> --- >>> net/ipv6/addrconf.c | 6 +- >>> 1 file changed, 5 insertions(+), 1 deletion(-) >>> >>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c >>> index 80ce478..361993a 100644 >>> --- a/net/ipv6/addrconf.c >>> +++ b/net/ipv6/addrconf.c >>> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w) >>> } else if (action == DAD_ABORT) { >>> in6_ifa_hold(ifp); >>> addrconf_dad_stop(ifp, 1); >>> - if (disable_ipv6) >>> + if (disable_ipv6) { >>> + in6_ifa_put(ifp); >>> addrconf_ifdown(idev->dev, 0); >>> + goto unlock; >>> + } >> >> >> But addrconf_dad_stop() calls ipv6_del_addr() which could unhash >> the addr too... >> Further investigation shows that none of the code block above is at fault. Debugging shows that the problem is happening with DAD_BEGIN and not DAD_ABORT. Follows more detail on the issue, but as I do not have a fix at this stage, I retract this submission altogether. The problem is due to rapidly adding the same address fd00::bb on ip6tnl0, and also without running DAD (accept_dad < 1), so it's an edge case. Typically the call to addrconf_dad_work() starts with an ifp refcnt of 3. 
Then via addrconf_dad_begin() and addrconf_dad_completed(), the call to addrconf_del_dad_work() decrements the refcnt to 2 because the call to cancel_delayed_work() returns 1. The second normal case is when the call to addrconf_dad_work() starts with an ifp refcnt of 2, in which case the call to cancel_delayed_work() returns 0 and so there is no decrement of the refcnt, which correctly stays at 2. The error case is when the call to addrconf_dad_work() starts with an ifp refcnt of 2, but the call to cancel_delayed_work() then also decrements the refcnt to 1, so the final in6_ifa_put() detects that the refcnt is being reduced to 0 for an active address. So the question is whether the interaction of cancel_delayed_work() in addrconf_dad_work(), delayed_work_pending() in addrconf_mod_dad_work() and INIT_DELAYED_WORK in ipv6_add_addr() [along with the handling for this when deleting addresses] needs improving, and if so how?
Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
On 03/05/17 18:58, Cong Wang wrote: > On Tue, May 2, 2017 at 11:30 AM, Mike Manning <mmann...@brocade.com> wrote: >> While this is not reproducible manually, Andrey's syzkaller program hit >> the warning "IPv6: Freeing alive inet6 address" with this part trace: >> >> inet6_ifa_finish_destroy+0x12e/0x190 c:894 >> in6_ifa_put ./include/net/addrconf.h:330 >> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963 >> >> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather >> than after calling addrconf_ifdown(), as the latter may remove it from >> the address hash table. >> >> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict >> DAD") >> Reported-by: Andrey Konovalov <andreyk...@google.com> >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> --- >> net/ipv6/addrconf.c | 6 +- >> 1 file changed, 5 insertions(+), 1 deletion(-) >> >> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c >> index 80ce478..361993a 100644 >> --- a/net/ipv6/addrconf.c >> +++ b/net/ipv6/addrconf.c >> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w) >> } else if (action == DAD_ABORT) { >> in6_ifa_hold(ifp); >> addrconf_dad_stop(ifp, 1); >> - if (disable_ipv6) >> + if (disable_ipv6) { >> + in6_ifa_put(ifp); >> addrconf_ifdown(idev->dev, 0); >> + goto unlock; >> + } > > > But addrconf_dad_stop() calls ipv6_del_addr() which could unhash > the addr too... > Agreed, and in the mean time Andrey has confirmed that this v1 patch does not resolve the issue. The problem is not specific to my change for removing addresses. It seems that generally here the in6_ifa_hold() and matching in6_ifa_put() are surplus to requirement, as the address refcnt is 2 even without the hold before calling DAD stop.
Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address
On reflection, please put this on hold subject to testing with syzkaller. I have not had a repro of the issue and so the fix even though harmless may not be effective. Thanks Mike On 02/05/17 19:30, Mike Manning wrote: > While this is not reproducible manually, Andrey's syzkaller program hit > the warning "IPv6: Freeing alive inet6 address" with this part trace: > > inet6_ifa_finish_destroy+0x12e/0x190 c:894 > in6_ifa_put ./include/net/addrconf.h:330 > addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963 > > The fix is to call in6_ifa_put() for the inet6_ifaddr before rather > than after calling addrconf_ifdown(), as the latter may remove it from > the address hash table. > > Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict > DAD") > Reported-by: Andrey Konovalov <andreyk...@google.com> > Signed-off-by: Mike Manning <mmann...@brocade.com> > --- > net/ipv6/addrconf.c | 6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c > index 80ce478..361993a 100644 > --- a/net/ipv6/addrconf.c > +++ b/net/ipv6/addrconf.c > @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w) > } else if (action == DAD_ABORT) { > in6_ifa_hold(ifp); > addrconf_dad_stop(ifp, 1); > - if (disable_ipv6) > + if (disable_ipv6) { > + in6_ifa_put(ifp); > addrconf_ifdown(idev->dev, 0); > + goto unlock; > + } > goto out; > } > > @@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w) > ifp->dad_nonce); > out: > in6_ifa_put(ifp); > +unlock: > rtnl_unlock(); > } > >
[PATCH] net: ipv6: Fix warning of freeing alive inet6 address
While this is not reproducible manually, Andrey's syzkaller program hit the warning "IPv6: Freeing alive inet6 address" with this part trace: inet6_ifa_finish_destroy+0x12e/0x190 c:894 in6_ifa_put ./include/net/addrconf.h:330 addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963 The fix is to call in6_ifa_put() for the inet6_ifaddr before rather than after calling addrconf_ifdown(), as the latter may remove it from the address hash table. Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict DAD") Reported-by: Andrey Konovalov <andreyk...@google.com> Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 80ce478..361993a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w) } else if (action == DAD_ABORT) { in6_ifa_hold(ifp); addrconf_dad_stop(ifp, 1); - if (disable_ipv6) + if (disable_ipv6) { + in6_ifa_put(ifp); addrconf_ifdown(idev->dev, 0); + goto unlock; + } goto out; } @@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_nonce); out: in6_ifa_put(ifp); +unlock: rtnl_unlock(); } -- 2.1.4
Re: net/ipv6: warning in inet6_ifa_finish_destroy
On 28/04/17 21:39, Cong Wang wrote: > On Fri, Apr 28, 2017 at 6:08 AM, Andrey Konovalov <andreyk...@google.com> wrote: >> Hi, >> >> I've got the following error report while fuzzing the kernel with syzkaller. >> >> On commit 5a7ad1146caa895ad718a534399e38bd2ba721b7 (4.11-rc8). >> >> C reproducer and .config are attached. >> It takes 1-2 minutes of running the reproducer to trigger the issue. >> >> ------------[ cut here ]------------ >> WARNING: CPU: 0 PID: 21 at net/ipv6/addrconf.c:894 >> inet6_ifa_finish_destroy+0x12e/0x190 >> Modules linked in: >> CPU: 0 PID: 21 Comm: kworker/0:1 Not tainted 4.11.0-rc8+ #296 >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 >> Workqueue: ipv6_addrconf addrconf_dad_work >> Call Trace: >> __dump_stack lib/dump_stack.c:16 >> dump_stack+0x292/0x398 lib/dump_stack.c:52 >> __warn+0x19f/0x1e0 kernel/panic.c:549 >> warn_slowpath_null+0x2c/0x40 kernel/panic.c:584 >> inet6_ifa_finish_destroy+0x12e/0x190 net/ipv6/addrconf.c:894 >> in6_ifa_put ./include/net/addrconf.h:330 >> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963 > > > I don't look too much, but a quick glance shows in the following > path: > > } else if (action == DAD_ABORT) { > in6_ifa_hold(ifp); > addrconf_dad_stop(ifp, 1); > if (disable_ipv6) > addrconf_ifdown(idev->dev, 0); > goto out; > } > > the inet6_addr could be removed from hash table in > addrconf_ifdown() before calling in6_ifa_put(). which causes > this warning. > git describe 85b51b12115c79cce7ea1ced6c0bd0339a165d3f --contains v4.8-rc5~34^2~32 This fix was introduced in 4.8, so it is interesting that this problem is only showing up now for 4.11. Also, it is not reproducible manually, i.e. DAD failure with disable_ipv6 works just fine without triggering this warning, with or without keeping IPv6 addresses on admin down. I will go ahead with putting out a fix so that in6_ifa_put() precedes the call to addrconf_ifdown() in this case. Thanks for the heads up on this, Mike
[PATCH v2 net-next] bridge: add per-port broadcast flood flag
Support for l2 multicast flood control was added in commit b6cb5ac8331b ("net: bridge: add per-port multicast flood flag"). It allows broadcast as it was introduced specifically for unknown multicast flood control. But as broadcast is a special case of multicast, this may also need to be disabled. For this purpose, introduce a flag to disable the flooding of received l2 broadcasts. This approach is backwards compatible and provides flexibility in filtering for the desired packet types. Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com> Signed-off-by: Mike Manning <mmann...@brocade.com> --- include/linux/if_bridge.h| 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 24 +--- net/bridge/br_if.c | 2 +- net/bridge/br_netlink.c | 3 +++ net/bridge/br_sysfs_if.c | 2 ++ 6 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c5847dc..0c16866 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -48,6 +48,7 @@ struct br_ip_list { #define BR_MCAST_FLOOD BIT(11) #define BR_MULTICAST_TO_UNICASTBIT(12) #define BR_VLAN_TUNNEL BIT(13) +#define BR_BCAST_FLOOD BIT(14) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 633aa02..8e56ac7 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -323,6 +323,7 @@ enum { IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 902af6b..48fb174 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -183,13 +183,23 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, struct net_bridge_port *p; list_for_each_entry_rcu(p, >port_list, list) { - /* Do not flood unicast traffic to ports that turn it off */ - if (pkt_type == BR_PKT_UNICAST && 
!(p->flags & BR_FLOOD)) - continue; - /* Do not flood if mc off, except for traffic we originate */ - if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) - continue; + /* Do not flood unicast traffic to ports that turn it off, nor +* other traffic if flood off, except for traffic we originate +*/ + switch (pkt_type) { + case BR_PKT_UNICAST: + if (!(p->flags & BR_FLOOD)) + continue; + break; + case BR_PKT_MULTICAST: + if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + case BR_PKT_BROADCAST: + if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 6d273ca..b436ea0 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD; + p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6509864..a572db71 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -189,6 +189,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, + !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -683,6 +685,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br
Re: [PATCH] net: bridge: suppress broadcast when multicast flood is disabled
On 24/04/17 20:52, Nikolay Aleksandrov wrote: > On 24/04/17 17:09, Mike Manning wrote: >> Flood suppression for packets that are not unicast needs to be handled >> consistently by also not flooding broadcast packets. As broadcast is a >> special case of multicast, the same kernel parameter should be used to >> suppress flooding for both of these packet types. >> >> Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") >> Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com> >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> --- >> net/bridge/br_forward.c | 17 ++--- >> 1 file changed, 10 insertions(+), 7 deletions(-) >> > > I do not agree that this is a bug fix, the behaviour was intentional and is > close to how HW > handles this flag. It has been like that for a few releases and changing it > may impact setups > that use the flag since up until now they've seen the broadcast but not > multicast packets and > suddenly their broadcast will stop. > > I think it would be better to introduce a third flag for bcast in net-next > and use that to > filter it since that would give us the ability to program HW that can > distinguish these > and have both options available, moreover it will not break any user setups > relying on > the current flag behaviour and we have such setups. > > Thanks, > Nik > > Hi Nik, What is the usecase for flooding broadcast but not multicast please? Is the lack of flood suppression for broadcast just something that has not been explicitly tested for in those setups? This is the case for us, the bug raised only at this stage of the release cycle. While adding another kernel param is an option, I would only do so if absolutely necessary so as to avoid proliferation of params. Also to justify adding such a flag for broadcast suppression, I would need to add a comment to explain that while broadcast is a subset of multicast, the multicast flood suppression flag excludes broadcast. Thanks Mike
[PATCH] net: bridge: suppress broadcast when multicast flood is disabled
Flood suppression for packets that are not unicast needs to be handled consistently by also not flooding broadcast packets. As broadcast is a special case of multicast, the same kernel parameter should be used to suppress flooding for both of these packet types. Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com> Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/bridge/br_forward.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 902af6b..a61c7ad 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -183,13 +183,16 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list) { - /* Do not flood unicast traffic to ports that turn it off */ - if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) - continue; - /* Do not flood if mc off, except for traffic we originate */ - if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) - continue; + /* Do not flood unicast traffic to ports that turn it off, nor +* other traffic if mc flood off except for traffic we originate +*/ + if (pkt_type == BR_PKT_UNICAST) { + if (!(p->flags & BR_FLOOD)) + continue; + } else { + if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) + continue; + } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) -- 2.1.4
Re: [PATCH] net: bridge: allow IPv6 when multicast flood is disabled
On 28/02/17 09:20, Nikolay Aleksandrov wrote: > We are aware of this and have discussed it, but I’m not sure this is the best > way to fix it, > it will still allow local IPv4 mcast to be flooded on all ports even with > that flag removed and > that definitely changes user-visible behaviour (even if it is okay) and will > not be appropriate > for -net. > > Let me get back to you on this one. > > Thanks, > Nik > Thanks for your comments, I have sent a v2 patch accordingly in case you have no better suggestion. We need per-port disabling of multicast flooding, but have to apply this patch to allow IPv6 connectivity so as to make it usable. There is no noteworthy impact on IPv4 as the fix only allows packets originated by the device. As this feature is new to the 4.9 kernel, there are no backwards compatibility issues with prior kernel versions if this fix is also applied to the 4.9 kernel.
[PATCH v2] net: bridge: allow IPv6 when multicast flood is disabled
Even with multicast flooding turned off, IPv6 ND should still work so that IPv6 connectivity is provided. Allow this by continuing to flood multicast traffic originated by us. Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com> Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/bridge/br_forward.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6bfac29..902af6b 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + /* Do not flood if mc off, except for traffic we originate */ if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD)) + !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) continue; /* Do not flood to ports that enable proxy ARP */ -- 2.1.4
[PATCH] net: bridge: allow IPv6 when multicast flood is disabled
Even with multicast flooding turned off, IPv6 ND should still work so that IPv6 connectivity is provided. Allow this by continuing to flood multicast traffic originated by us. And similar to the unicast case, set auto-mask if the multicast flood flag is set. Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com> Signed-off-by: Mike Manning <mmann...@brocade.com> --- include/linux/if_bridge.h | 2 +- net/bridge/br_forward.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c5847dc..7731808 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -40,12 +40,12 @@ struct br_ip_list { #define BR_ADMIN_COST BIT(4) #define BR_LEARNINGBIT(5) #define BR_FLOOD BIT(6) -#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING) #define BR_PROMISC BIT(7) #define BR_PROXYARPBIT(8) #define BR_LEARNING_SYNC BIT(9) #define BR_PROXYARP_WIFI BIT(10) #define BR_MCAST_FLOOD BIT(11) +#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING | BR_MCAST_FLOOD) #define BR_MULTICAST_TO_UNICASTBIT(12) #define BR_VLAN_TUNNEL BIT(13) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6bfac29..7fe7d58 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) continue; + /* Do not flood if mc off, except for traffic we originate */ if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD)) + !(p->flags & BR_MCAST_FLOOD) && (skb->dev != br->dev)) continue; /* Do not flood to ports that enable proxy ARP */ -- 2.1.4
Re: [PATCH] net: ipv6: avoid errors due to per-cpu atomic alloc
On 11/22/2016 12:18 PM, Hannes Frederic Sowa wrote: > On 22.11.2016 11:34, Mike Manning wrote: >> Bursts of failures may occur when adding IPv6 routes via Netlink to the >> kernel when testing under scale (e.g. 500 routes lost out of 1M). The >> reason is that percpu.c:pcpu_balance_workfn() is not guaranteed to have >> extended the area map in time for the atomic allocation using percpu.c: >> pcpu_alloc() to succeed. This results in route additions failing with >> an -ENOMEM error. >> >> While the sender of the Netlink msg to add this route could check for >> an ACK and retransmit in the case of an -ENOMEM error, the latter >> should not occur in the first place if there is plenty of memory. The >> solution is to use non-atomic alloc for rt6_info instead. While the >> client may now be blocked for longer depending on the state of the >> chunk being added to, this work has to be incurred at some point. >> >> The alternative solution would be to provide configurable parameters >> e.g. via sysctl in percpu.c for default map size, low/high empty pages >> and map margins. For this solution, the map margin sizes need to be >> stored per chunk, as large margins cannot be used if the dynamic early >> slots map size is in use. This is not a preferred solution though, as >> it requires tuning of these parameters to provide sufficient margins to >> avoid -ENOMEM errors depending on system requirements. 
>> >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> --- >> net/ipv6/route.c |2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/net/ipv6/route.c b/net/ipv6/route.c >> index 1b57e11..0e9bb76 100644 >> --- a/net/ipv6/route.c >> +++ b/net/ipv6/route.c >> @@ -347,7 +347,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, >> struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); >> >> if (rt) { >> -rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); >> +rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); >> if (rt->rt6i_pcpu) { >> int cpu; > > Nak, this doesn't work, as ip6_dst_alloc must be callable from > non-blocking code paths unfortunately. > > Thanks for the prompt reply. Do you consider the alternative of providing configurable parameters for per-cpu alloc as viable, or is there a better way of dealing with this? While I have tested such param changes under scale as avoiding the -ENOMEM errors, it would be good to get confirmation that this approach is acceptable prior to coding the sysctl handling for these.
[PATCH] net: ipv6: avoid errors due to per-cpu atomic alloc
Bursts of failures may occur when adding IPv6 routes via Netlink to the kernel when testing under scale (e.g. 500 routes lost out of 1M). The reason is that percpu.c:pcpu_balance_workfn() is not guaranteed to have extended the area map in time for the atomic allocation using percpu.c: pcpu_alloc() to succeed. This results in route additions failing with an -ENOMEM error. While the sender of the Netlink msg to add this route could check for an ACK and retransmit in the case of an -ENOMEM error, the latter should not occur in the first place if there is plenty of memory. The solution is to use non-atomic alloc for rt6_info instead. While the client may now be blocked for longer depending on the state of the chunk being added to, this work has to be incurred at some point. The alternative solution would be to provide configurable parameters e.g. via sysctl in percpu.c for default map size, low/high empty pages and map margins. For this solution, the map margin sizes need to be stored per chunk, as large margins cannot be used if the dynamic early slots map size is in use. This is not a preferred solution though, as it requires tuning of these parameters to provide sufficient margins to avoid -ENOMEM errors depending on system requirements. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/route.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1b57e11..0e9bb76 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -347,7 +347,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); if (rt) { - rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); if (rt->rt6i_pcpu) { int cpu; -- 1.7.10.4
Re: [PATCH] net: ipv6: Disable forwarding per interface via sysctl
On 09/16/2016 04:46 PM, Hannes Frederic Sowa wrote: > On 16.09.2016 15:39, Eric Dumazet wrote: >> On Fri, 2016-09-16 at 13:47 +0100, Mike Manning wrote: >>> Disabling forwarding per interface via sysctl continues to allow >>> forwarding. This is contrary to the sysctl documentation stating that >>> the forwarding sysctl is per interface, whereas currently it is only >>> the sysctl for all interfaces that has an effect on forwarding. The >>> solution is to drop any received packets instead of forwarding them >>> if the ingress device has a per-device forwarding sysctl that is unset. >> >> Some archaeological research might be needed because >> Documentation/networking/ip-sysctl.txt states : >> >> IPv4 and IPv6 work differently here; e.g. netfilter must be used >> to control which interfaces may forward packets and which not. >> >> If this netfilter requirement is obsolete, then your patch would need to >> change the doc as well. >> >> Hannes can probably comment on this ? > > Yep, thanks. > > This commit breaks a very common setup: people globally enabled > forwarding but disabled the forwarding knob on one special interface to > allow this interface to participate in auto configuration from their > provider while still forwarding packets over this interface. > > I fear this is so common that this would be a uapi violation. > > Thanks, > Hannes > > Thanks for the use-case, I request to withdraw this patch then. So configuring an interface on a router to be in host mode is not actually disabling forwarding in the kernel, it is merely to allow SLAAC. Using ip6tables for the purpose of disabling forwarding on an interface if one wants an interface in host mode seems a heavyweight solution to work around this. If anyone has any better suggestions, please let me know.
[PATCH] net: ipv6: Disable forwarding per interface via sysctl
Disabling forwarding per interface via sysctl continues to allow forwarding. This is contrary to the sysctl documentation stating that the forwarding sysctl is per interface, whereas currently it is only the sysctl for all interfaces that has an effect on forwarding. The solution is to drop any received packets instead of forwarding them if the ingress device has a per-device forwarding sysctl that is unset. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/ip6_output.c |4 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1dfc402..37cd1d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -380,11 +380,15 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct inet6_dev *idev = __in6_dev_get(skb->dev); u32 mtu; if (net->ipv6.devconf_all->forwarding == 0) goto error; + if (idev && !idev->cnf.forwarding) + goto error; + if (skb->pkt_type != PACKET_HOST) goto drop; -- 1.7.10.4
[PATCH] net: ipv6: Failure to disable forwarding per interface via sysctl
Disabling forwarding per interface via sysctl continues to allow forwarding. This is contrary to the sysctl documentation stating that the forwarding sysctl is per interface, whereas currently it is only the sysctl for all interfaces that has an effect on forwarding. The solution is to drop any received packets instead of forwarding them if the ingress device has a per-device forwarding sysctl that is unset. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/ip6_output.c |4 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1dfc402..37cd1d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -380,11 +380,15 @@ int ip6_forward(struct sk_buff *skb) struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct inet6_dev *idev = __in6_dev_get(skb->dev); u32 mtu; if (net->ipv6.devconf_all->forwarding == 0) goto error; + if (idev && !idev->cnf.forwarding) + goto error; + if (skb->pkt_type != PACKET_HOST) goto drop; -- 1.7.10.4
[PATCH v2] net: ipv6: Remove addresses for failures with strict DAD
If DAD fails with accept_dad set to 2, global addresses and host routes are incorrectly left in place. Even though disable_ipv6 is set, contrary to documentation, the addresses are not dynamically deleted from the interface. It is only on a subsequent link down/up that these are removed. The fix is not only to set the disable_ipv6 flag, but also to call addrconf_ifdown(), which is the action to carry out when disabling IPv6. This results in the addresses and routes being deleted immediately. The DAD failure for the LL addr is determined as before via netlink, or by the absence of the LL addr (which also previously would have had to be checked for in case of an intervening link down and up). As the call to addrconf_ifdown() requires an rtnl lock, the logic to disable IPv6 when DAD fails is moved to addrconf_dad_work(). Previous behavior: root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2 net.ipv6.conf.eth3.accept_dad = 2 root@vm1:/# ip -6 addr add 2000::10/64 dev eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 5: eth3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000 inet6 2000::10/64 scope global valid_lft forever preferred_lft forever inet6 fe80::5054:ff:fe43:dd5a/64 scope link tentative dadfailed valid_lft forever preferred_lft forever root@vm1:/# ip -6 route show dev eth3 2000::/64 proto kernel metric 256 fe80::/64 proto kernel metric 256 root@vm1:/# ip link set down eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 root@vm1:/# ip -6 route show dev eth3 root@vm1:/# New behavior: root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2 net.ipv6.conf.eth3.accept_dad = 2 root@vm1:/# ip -6 addr add 2000::10/64 dev eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 root@vm1:/# ip -6 route show dev eth3 root@vm1:/# Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c | 34 +- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/net/ipv6/addrconf.c 
b/net/ipv6/addrconf.c index df8425f..f418d2e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1872,7 +1872,6 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) void addrconf_dad_failure(struct inet6_ifaddr *ifp) { - struct in6_addr addr; struct inet6_dev *idev = ifp->idev; struct net *net = dev_net(ifp->idev->dev); @@ -1934,18 +1933,6 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp) in6_ifa_put(ifp2); lock_errdad: spin_lock_bh(&ifp->lock); - } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) { - addr.s6_addr32[0] = htonl(0xfe800000); - addr.s6_addr32[1] = 0; - - if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && - ipv6_addr_equal(&ifp->addr, &addr)) { - /* DAD failed for link-local based on MAC address */ - idev->cnf.disable_ipv6 = 1; - - pr_info("%s: IPv6 being disabled!\n", - ifp->idev->dev->name); - } } errdad: @@ -3821,6 +3808,7 @@ static void addrconf_dad_work(struct work_struct *w) dad_work); struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; + bool disable_ipv6 = false; enum { DAD_PROCESS, @@ -3837,6 +3825,24 @@ static void addrconf_dad_work(struct work_struct *w) } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) { action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; + + if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 && + !(ifp->flags & IFA_F_STABLE_PRIVACY)) { + struct in6_addr addr; + + addr.s6_addr32[0] = htonl(0xfe800000); + addr.s6_addr32[1] = 0; + + if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) && + ipv6_addr_equal(&ifp->addr, &addr)) { + /* DAD failed for link-local based on MAC */ + idev->cnf.disable_ipv6 = 1; + + pr_info("%s: IPv6 being disabled!\n", + ifp->idev->dev->name); + disable_ipv6 = true; + } + } } spin_unlock_bh(&ifp->lock); @@ -3845,6 +3851,8 @@ static void addrconf_dad_work(struct work_struct *w) goto out; } else if (action == DAD_ABORT) { addrconf_dad_stop(ifp, 1); + if (disable_ipv6) + addrconf_ifdown(idev->dev, 0); goto out; } -- 1.7.10.4
Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD
On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote: > On 17.08.2016 12:28, Mike Manning wrote: >> +static void dev_disable_change(struct inet6_dev *idev); >> >> /* >> * Configured unicast address hash table >> @@ -1945,6 +1946,12 @@ lock_errdad: >> >> pr_info("%s: IPv6 being disabled!\n", >> ifp->idev->dev->name); >> +spin_unlock_bh(&ifp->lock); >> +addrconf_dad_stop(ifp, 1); >> +rtnl_lock(); >> +dev_disable_change(idev); >> +rtnl_unlock(); >> +return; >> } >> } > > You can't take rtnl_lock at that point but must postpone the actions and > do that in addrconf_dad_work. > > Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs > to move there. > > Bye, > Hannes > > Thanks for the prompt review, I will look into making these changes. Also these changes caused a build error due to conditional compilation without CONFIG_SYSCTL, which is resolved by replacing the call to dev_disable_change(idev) by directly calling addrconf_ifdown(idev->dev, 0) instead. I would appreciate any further comments if the suggested change in behavior is not acceptable. Thanks Mike
[PATCH] net: ipv6: Remove addresses for failures with strict DAD
If DAD fails with accept_dad set to 2, global addresses and host routes are incorrectly left in place. Even though disable_ipv6 is set, contrary to documentation, the addresses are not dynamically deleted from the interface. It is only on a subsequent link down/up that these are removed. The fix is not only to set the disable_ipv6 flag, but to actually disable IPv6 when DAD fails in this case. This results in the addresses and routes being immediately deleted. The DAD failure for the LL addr is determined as before via netlink, or by the absence of the LL addr (which also previously would have had to be checked for in case of an intervening link down/up). Previous behavior: root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2 net.ipv6.conf.eth3.accept_dad = 2 root@vm1:/# ip -6 addr add 2000::10/64 dev eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 5: eth3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000 inet6 2000::10/64 scope global valid_lft forever preferred_lft forever inet6 fe80::5054:ff:fe43:dd5a/64 scope link tentative dadfailed valid_lft forever preferred_lft forever root@vm1:/# ip -6 route show dev eth3 2000::/64 proto kernel metric 256 fe80::/64 proto kernel metric 256 root@vm1:/# ip link set down eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 root@vm1:/# ip -6 route show dev eth3 root@vm1:/# New behavior: root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2 net.ipv6.conf.eth3.accept_dad = 2 root@vm1:/# ip -6 addr add 2000::10/64 dev eth3 root@vm1:/# ip link set up eth3 root@vm1:/# ip -6 addr show dev eth3 root@vm1:/# ip -6 route show dev eth3 root@vm1:/# Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c |7 +++ 1 file changed, 7 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index df8425f..6be5a95 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -135,6 +135,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev); static int 
ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); +static void dev_disable_change(struct inet6_dev *idev); /* * Configured unicast address hash table @@ -1945,6 +1946,12 @@ lock_errdad: pr_info("%s: IPv6 being disabled!\n", ifp->idev->dev->name); + spin_unlock_bh(>lock); + addrconf_dad_stop(ifp, 1); + rtnl_lock(); + dev_disable_change(idev); + rtnl_unlock(); + return; } } -- 1.7.10.4
[PATCH] net: ipv6: Do not keep IPv6 addresses when IPv6 is disabled
If IPv6 is disabled when the option is set to keep IPv6 addresses on link down, userspace is unaware of this as there is no such indication via netlink. The solution is to remove the IPv6 addresses in this case, which results in netlink messages indicating removal of addresses in the usual manner. This fix also makes the behavior consistent with the case of having IPv6 disabled first, which stops IPv6 addresses from being added. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab3e796..df8425f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3543,7 +3543,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) /* combine the user config with event to determine if permanent * addresses are to be removed from address hash table */ - keep_addr = !(how || _keep_addr <= 0); + keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6); /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { @@ -3599,7 +3599,7 @@ restart: /* re-combine the user config with event to determine if permanent * addresses are to be removed from the interface list */ - keep_addr = (!how && _keep_addr > 0); + keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); INIT_LIST_HEAD(_list); list_for_each_entry_safe(ifa, tmp, >addr_list, if_list) { -- 1.7.10.4
[PATCH v2] net: ipv6: Always leave anycast and multicast groups on link down
Default kernel behavior is to delete IPv6 addresses on link down, which entails deletion of the multicast and the subnet-router anycast addresses. These deletions do not happen with sysctl setting to keep global IPv6 addresses on link down, so every link down/up causes an increment of the anycast and multicast refcounts. These bogus refcounts may stop these addrs from being removed on subsequent calls to delete them. The solution is to leave the groups for the multicast and subnet anycast on link down for the callflow when global IPv6 addresses are kept. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c |4 1 file changed, 4 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 47f837a..047c75a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3562,6 +3562,10 @@ restart: if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } else { + if (idev->cnf.forwarding) + addrconf_leave_anycast(ifa); + addrconf_leave_solict(ifa->idev, >addr); } write_lock_bh(>lock); -- 1.7.10.4
[PATCH] net: ipv6: Always leave subnet anycast group on link down
Default kernel behavior is to delete IPv6 addresses on link down, which entails deletion of the address-derived subnet-router anycast address. The latter does not happen with sysctl setting to keep global IPv6 addrs on link down, so every link down/up causes an increment of the anycast refcount, cf aca_users in __ipv6_dev_ac_inc(). This bogus refcount stops the anycast being removed on subsequent calls to delete the address. The solution is to leave the group for this subnet anycast on link down also for the callflow when global IPv6 addresses are kept. Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional") Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/ipv6/addrconf.c |2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 47f837a..3c69e56 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3562,6 +3562,8 @@ restart: if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } else if (idev->cnf.forwarding) { + addrconf_leave_anycast(ifa); } write_lock_bh(>lock); -- 1.7.10.4
[PATCH net v5] vlan: Propagate MAC address to VLANs
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. The VLANs should continue inheriting the MAC address of the physical interface until the VLAN MAC address is explicitly set to any value. This allows IPv6 EUI64 addresses for the VLAN to reflect any changes to the MAC of the physical interface and thus for DAD to behave as expected. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/8021q/vlan.c |5 + net/8021q/vlan.h |2 ++ net/8021q/vlan_dev.c | 20 +--- 3 files changed, 24 insertions(+), 3 deletions(-) --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -291,6 +291,10 @@ static void vlan_sync_address(struct net if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; + /* vlan continues to inherit address of lower device */ + if (vlan_dev_inherit_address(vlandev, dev)) + goto out; + /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && @@ -303,6 +307,7 @@ static void vlan_sync_address(struct net !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); +out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_devic void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); +bool vlan_dev_inherit_address(struct net_device *dev, + struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -244,6 +244,17 @@ void vlan_dev_get_realdev_name(const str strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23); } +bool vlan_dev_inherit_address(struct net_device *dev, + 
struct net_device *real_dev) +{ + if (dev->addr_assign_type != NET_ADDR_STOLEN) + return false; + + ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return true; +} + static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); @@ -254,7 +265,8 @@ static int vlan_dev_open(struct net_devi !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; - if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) { + if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && + !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; @@ -558,8 +570,10 @@ static int vlan_dev_init(struct net_devi /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, real_dev); + if (is_zero_ether_addr(dev->dev_addr)) { + ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + dev->addr_assign_type = NET_ADDR_STOLEN; + } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); -- 1.7.10.4
Re: [PATCH net v4] vlan: Propagate MAC address to VLANs
On 05/09/2016 07:48 AM, Michal Kubecek wrote: > On Sat, May 07, 2016 at 11:00:09AM +0100, Mike Manning wrote: >> The MAC address of the physical interface is only copied to the VLAN >> when it is first created, resulting in an inconsistency after MAC >> address changes of only newly created VLANs having an up-to-date MAC. >> >> The VLANs should continue inheriting the MAC address of the physical >> interface until the VLAN MAC address is explicitly set to any value. >> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes >> to the MAC of the physical interface and thus for DAD to behave as >> expected. >> >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> --- >> net/8021q/vlan.c |7 +++ >> net/8021q/vlan_dev.c | 14 ++ >> 2 files changed, 17 insertions(+), 4 deletions(-) >> >> --- a/net/8021q/vlan.c >> +++ b/net/8021q/vlan.c >> @@ -291,6 +291,12 @@ static void vlan_sync_address(struct net >> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) >> return; >> >> +/* vlan continues to inherit address of parent interface */ >> +if (vlandev->addr_assign_type == NET_ADDR_STOLEN) { >> +ether_addr_copy(vlandev->dev_addr, dev->dev_addr); >> +goto out; >> +} >> + > > I might have missed something in the previous discussion but as > ether_addr_copy() is just an optimized memcpy(), how is this going to > handle the setups where the vlan device itself has an upper device? For > example, > > - if it is a bridge port, how is the bridge going to learn about its > address change so that it can update its FDB? > - if it is a bond slave or team port, current code preserves the vlan > device address on real device change so everything is fine; your > proposal would change vlan device's address without bond being even > notified, I believe > - there might be a macvlan on top of the vlan and you might > accidentally match its address with the new one > Thanks for your review and this excellent catch. 
I will add address notification for the vlan itself and test appropriately for when an upper device is present. >> /* vlan address was different from the old address and is equal to >> * the new address */ >> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && >> @@ -303,6 +309,7 @@ static void vlan_sync_address(struct net >> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) >> dev_uc_add(dev, vlandev->dev_addr); >> >> +out: >> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); >> } >> >> --- a/net/8021q/vlan_dev.c >> +++ b/net/8021q/vlan_dev.c >> @@ -255,9 +255,13 @@ static int vlan_dev_open(struct net_devi >> return -ENETDOWN; >> >> if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) { >> -err = dev_uc_add(real_dev, dev->dev_addr); >> -if (err < 0) >> -goto out; >> +if (dev->addr_assign_type == NET_ADDR_STOLEN) { >> +ether_addr_copy(dev->dev_addr, real_dev->dev_addr); > > The same question here. > >> +} else { >> +err = dev_uc_add(real_dev, dev->dev_addr); >> +if (err < 0) >> +goto out; >> +} >> } >> >> if (dev->flags & IFF_ALLMULTI) { >> @@ -558,8 +562,10 @@ static int vlan_dev_init(struct net_devi >> /* ipv6 shared card related stuff */ >> dev->dev_id = real_dev->dev_id; >> >> -if (is_zero_ether_addr(dev->dev_addr)) >> +if (is_zero_ether_addr(dev->dev_addr)) { >> eth_hw_addr_inherit(dev, real_dev); >> +dev->addr_assign_type = NET_ADDR_STOLEN; > > You might want to replace eth_hw_addr_inherit() with ether_addr_copy() > here as they only differ in the former copying addr_assign_type which > you are going to rewrite anyway. (But as both are most likely inlined, > I would expect the resulting code to be the same in the end.) > >Michal Kubecek > Thanks. Yes, I was aware of this but decided not to change it to keep the changeset to a minimum. I will make the change as recommended. 
>> +} >> if (is_zero_ether_addr(dev->broadcast)) >> memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); >> >> -- >> 1.7.10.4 >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >> >>
[PATCH net v4] vlan: Propagate MAC address to VLANs
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. The VLANs should continue inheriting the MAC address of the physical interface until the VLAN MAC address is explicitly set to any value. This allows IPv6 EUI64 addresses for the VLAN to reflect any changes to the MAC of the physical interface and thus for DAD to behave as expected. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/8021q/vlan.c |7 +++ net/8021q/vlan_dev.c | 14 ++ 2 files changed, 17 insertions(+), 4 deletions(-) --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -291,6 +291,12 @@ static void vlan_sync_address(struct net if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; + /* vlan continues to inherit address of parent interface */ + if (vlandev->addr_assign_type == NET_ADDR_STOLEN) { + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); + goto out; + } + /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && @@ -303,6 +309,7 @@ static void vlan_sync_address(struct net !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); +out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -255,9 +255,13 @@ static int vlan_dev_open(struct net_devi return -ENETDOWN; if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) { - err = dev_uc_add(real_dev, dev->dev_addr); - if (err < 0) - goto out; + if (dev->addr_assign_type == NET_ADDR_STOLEN) { + ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + } else { + err = dev_uc_add(real_dev, dev->dev_addr); + if (err < 0) + goto out; + } } if (dev->flags & IFF_ALLMULTI) { @@ -558,8 +562,10 @@ static int vlan_dev_init(struct net_devi /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; - if 
(is_zero_ether_addr(dev->dev_addr)) + if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_inherit(dev, real_dev); + dev->addr_assign_type = NET_ADDR_STOLEN; + } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); -- 1.7.10.4
Re: [PATCH net v3] vlan: Propagate MAC address to VLANs
On 05/06/2016 08:48 PM, Alexander Duyck wrote: > On Fri, May 6, 2016 at 12:36 PM, Mike Manning <mmann...@brocade.com> wrote: >> On 05/06/2016 06:02 PM, Alexander Duyck wrote: >>> On Fri, May 6, 2016 at 6:26 AM, Mike Manning <mmann...@brocade.com> wrote: >>>> The MAC address of the physical interface is only copied to the VLAN >>>> when it is first created, resulting in an inconsistency after MAC >>>> address changes of only newly created VLANs having an up-to-date MAC. >>>> >>>> The VLANs should continue inheriting the MAC address of the physical >>>> interface, unless explicitly changed to be different from this. >>>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes >>>> to the MAC of the physical interface and thus for DAD to behave as >>>> expected. >>>> >>>> Signed-off-by: Mike Manning <mmann...@brocade.com> >>>> --- >>>> include/linux/if_vlan.h |2 ++ >>>> net/8021q/vlan.c| 17 +++-- >>>> net/8021q/vlan_dev.c| 13 ++--- >>>> 3 files changed, 23 insertions(+), 9 deletions(-) >>>> >>>> --- a/include/linux/if_vlan.h >>>> +++ b/include/linux/if_vlan.h >>>> @@ -138,6 +138,7 @@ struct netpoll; >>>> * @flags: device flags >>>> * @real_dev: underlying netdevice >>>> * @real_dev_addr: address of underlying netdevice >>>> + * @addr_assign_type: address assignment type >>>> * @dent: proc dir entry >>>> * @vlan_pcpu_stats: ptr to percpu rx stats >>>> */ >>>> @@ -153,6 +154,7 @@ struct vlan_dev_priv { >>>> >>>> struct net_device *real_dev; >>>> unsigned char real_dev_addr[ETH_ALEN]; >>>> + unsigned char addr_assign_type; >>>> >>>> struct proc_dir_entry *dent; >>>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; >>> >>> Please don't start adding new members to structures when it already >>> exists in the net_device. If anything you should be able to drop >>> read_dev_addr if you do this correctly because you shouldn't need to >>> clone the lower dev address to watch for changes. All you will need >>> to do is watch NET_ADDR_STOLEN. 
>>> >> >> Thanks for the detailed review. I had initially used the existing type >> in net_device, but the problem with this was that it got overwritten to >> NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify. >> It would just be a case of setting the type earlier in that function >> (and caching the previous value in case there is an error). >> >> However, based on your later comment, it seems I should not bother with >> the approach I have here, namely that if the VLAN MAC is set to the same >> value as that of the lower device MAC, that is to be considered as >> resetting it and thus for MAC inheritance to resume. Instead, I will just >> make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited, >> and if it is set to anything (even the value of the lower device MAC), >> inheritance is stopped. I agree this makes for a far simpler changeset. >> >> I don't think I can remove real_dev_addr, as that is still needed for >> the existing functionality in vlan_sync_address() to determine if the sync >> should be done, also as a way of caching it for handling in vlan_dev_open(). > > The thing is that logic isn't really needed anymore though if you are > going to be following the lower dev. If you follow the code what it > is doing is adding the address via dev_uc_add if the lower address > moves away from the VLAN address. With your changes you are updating > the VLAN MAC address to the lower value in the NET_ADDR_STOLEN case so > you don't need to add or remove an extra unicast address. If the user > sets the MAC address you can then use the vlandev->dev_addr as the > address you add/remove from the unicast list and you probably don't > need to bother with tracking the lower device state anyway. > I agree that this logic is not needed at all for the NET_ADDR_STOLEN case. 
However, once the VLAN MAC has been explicitly set, the situation reverts to the existing functionality, whereby real_dev_addr is used to ensure that dev_uc_add/del are not incorrectly called multiple times for the same MACs. As an example, if the lower device MAC is different from the V
Re: [PATCH net v3] vlan: Propagate MAC address to VLANs
On 05/06/2016 06:02 PM, Alexander Duyck wrote: > On Fri, May 6, 2016 at 6:26 AM, Mike Manning <mmann...@brocade.com> wrote: >> The MAC address of the physical interface is only copied to the VLAN >> when it is first created, resulting in an inconsistency after MAC >> address changes of only newly created VLANs having an up-to-date MAC. >> >> The VLANs should continue inheriting the MAC address of the physical >> interface, unless explicitly changed to be different from this. >> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes >> to the MAC of the physical interface and thus for DAD to behave as >> expected. >> >> Signed-off-by: Mike Manning <mmann...@brocade.com> >> --- >> include/linux/if_vlan.h |2 ++ >> net/8021q/vlan.c| 17 +++-- >> net/8021q/vlan_dev.c| 13 ++--- >> 3 files changed, 23 insertions(+), 9 deletions(-) >> >> --- a/include/linux/if_vlan.h >> +++ b/include/linux/if_vlan.h >> @@ -138,6 +138,7 @@ struct netpoll; >> * @flags: device flags >> * @real_dev: underlying netdevice >> * @real_dev_addr: address of underlying netdevice >> + * @addr_assign_type: address assignment type >> * @dent: proc dir entry >> * @vlan_pcpu_stats: ptr to percpu rx stats >> */ >> @@ -153,6 +154,7 @@ struct vlan_dev_priv { >> >> struct net_device *real_dev; >> unsigned char real_dev_addr[ETH_ALEN]; >> + unsigned char addr_assign_type; >> >> struct proc_dir_entry *dent; >> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; > > Please don't start adding new members to structures when it already > exists in the net_device. If anything you should be able to drop > read_dev_addr if you do this correctly because you shouldn't need to > clone the lower dev address to watch for changes. All you will need > to do is watch NET_ADDR_STOLEN. > Thanks for the detailed review. I had initially used the existing type in net_device, but the problem with this was that it got overwritten to NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify. 
It would just be a case of setting the type earlier in that function (and caching the previous value in case there is an error). However, based on your later comment, it seems I should not bother with the approach I have here, namely that if the VLAN MAC is set to the same value as that of the lower device MAC, that is to be considered as resetting it and thus for MAC inheritance to resume. Instead, I will just make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited, and if it is set to anything (even the value of the lower device MAC), inheritance is stopped. I agree this makes for a far simpler changeset. I don't think I can remove real_dev_addr, as that is still needed for the existing functionality in vlan_sync_address() to determine if the sync should be done, also as a way of caching it for handling in vlan_dev_open(). As a matter of interest, what is the advantage of not updating the VLAN MAC when it is down? I appreciate that one should not add/delete secondary unicast addresses in this case, but there is no such restriction for copying the MAC. 
>> --- a/net/8021q/vlan.c >> +++ b/net/8021q/vlan.c >> @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net >> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) >> return; >> >> + /* vlan continues to inherit address of parent interface */ >> + if (vlan->addr_assign_type == NET_ADDR_STOLEN) { >> + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); >> + goto out; >> + } >> + >> + if (!(vlandev->flags & IFF_UP)) >> + goto out; >> + >> /* vlan address was different from the old address and is equal to >> * the new address */ >> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && >> @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net >> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) >> dev_uc_add(dev, vlandev->dev_addr); >> >> +out: >> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); >> } >> >> @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti >> >> case NETDEV_CHANGEADDR: >> /* Adjust unicast filters on underlying device */ >> - vlan_group_for_each_dev(g
[PATCH net v3] vlan: Propagate MAC address to VLANs
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. The VLANs should continue inheriting the MAC address of the physical interface, unless explicitly changed to be different from this. This allows IPv6 EUI64 addresses for the VLAN to reflect any changes to the MAC of the physical interface and thus for DAD to behave as expected. Signed-off-by: Mike Manning <mmann...@brocade.com> --- include/linux/if_vlan.h |2 ++ net/8021q/vlan.c| 17 +++-- net/8021q/vlan_dev.c| 13 ++--- 3 files changed, 23 insertions(+), 9 deletions(-) --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -138,6 +138,7 @@ struct netpoll; * @flags: device flags * @real_dev: underlying netdevice * @real_dev_addr: address of underlying netdevice + * @addr_assign_type: address assignment type * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats */ @@ -153,6 +154,7 @@ struct vlan_dev_priv { struct net_device *real_dev; unsigned char real_dev_addr[ETH_ALEN]; + unsigned char addr_assign_type; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; + /* vlan continues to inherit address of parent interface */ + if (vlan->addr_assign_type == NET_ADDR_STOLEN) { + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); + goto out; + } + + if (!(vlandev->flags & IFF_UP)) + goto out; + /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); +out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } @@ 
-389,13 +399,8 @@ static int vlan_device_event(struct noti case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; - if (!(flgs & IFF_UP)) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) vlan_sync_address(dev, vlandev); - } break; case NETDEV_CHANGEMTU: --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; struct sockaddr *addr = p; + bool is_real_addr; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr); + if (!(dev->flags & IFF_UP)) goto out; - if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { + if (!is_real_addr) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; @@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru out: ether_addr_copy(dev->dev_addr, addr->sa_data); + vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : NET_ADDR_SET; return 0; } @@ -558,8 +563,10 @@ static int vlan_dev_init(struct net_devi /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; - if (is_zero_ether_addr(dev->dev_addr)) + if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_inherit(dev, real_dev); + vlan_dev_priv(dev)->addr_assign_type = NET_ADDR_STOLEN; + } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); -- 1.7.10.4
Re: [PATCH net] vlan: Propagate MAC address changes properly
On 05/03/2016 05:16 AM, David Miller wrote: > From: Mike Manning <mmann...@brocade.com> > Date: Sat, 30 Apr 2016 11:32:37 +0100 > >> The MAC address of the physical interface is only copied to the VLAN >> when it is first created, resulting in an inconsistency after MAC >> address changes of only newly created VLANs having an up-to-date MAC. >> >> Continuing to inherit the MAC address unless explicitly changed for >> the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change >> and thus for DAD to behave as expected for the given MAC. >> >> Signed-off-by: Mike Manning <mmann...@brocade.com> > > What is this code really trying to achieve? > > Is it "Propagate real device MAC changes to underlying vlan device, > but not if the user set the vlan MAC explicitly."? Right, I will update the subject header to make this clearer. > > If so, implement that instead of all of these confusing tests. > > If the vlan device's set_mac_address operation is ever called, > set a boolean value in the vlan device private to true and test > it here. > Given that this information is implicit in real_dev_addr, I am reluctant to add another member to the vlan_dev_priv data structure, especially given that there may be a large number of VLANs. Instead I have added a variable real_addr_in_use in vlan_sync_address() to make this clearer.
[PATCH net v2] vlan: Propagate MAC address to VLANs unless explicitly set
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. Continuing to inherit the MAC address unless explicitly changed for the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change and thus for DAD to behave as expected for the given MAC. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/8021q/vlan.c | 22 ++ 1 file changed, 10 insertions(+), 12 deletions(-) --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -286,22 +286,25 @@ static void vlan_sync_address(struct net struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); + bool real_addr_in_use; /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; - /* vlan address was different from the old address and is equal to + real_addr_in_use = + ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr); + + /* vlan address was different from the real address and is equal to * the new address */ - if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && + if ((vlandev->flags & IFF_UP) && !real_addr_in_use && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); - /* vlan address was equal to the old address and is different from + /* vlan address was equal to the real address so now also inherit * the new address */ - if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && - !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) - dev_uc_add(dev, vlandev->dev_addr); + if (real_addr_in_use) + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } @@ -389,13 +392,8 @@ static int vlan_device_event(struct noti case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; - if (!(flgs & IFF_UP)) - 
continue; - + vlan_group_for_each_dev(grp, i, vlandev) vlan_sync_address(dev, vlandev); - } break; case NETDEV_CHANGEMTU: -- 1.7.10.4
[PATCH net] vlan: Propagate MAC address changes properly
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. Continuing to inherit the MAC address unless explicitly changed for the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change and thus for DAD to behave as expected for the given MAC. Signed-off-by: Mike Manning <mmann...@brocade.com> --- net/8021q/vlan.c | 17 ++--- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index d2cd9de..2f57cf2 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -293,15 +293,15 @@ static void vlan_sync_address(struct net_device *dev, /* vlan address was different from the old address and is equal to * the new address */ - if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && + if ((vlandev->flags & IFF_UP) && + !ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); - /* vlan address was equal to the old address and is different from + /* vlan address was equal to the old address so now also inherit * the new address */ - if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && - !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) - dev_uc_add(dev, vlandev->dev_addr); + if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr)) + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } @@ -389,13 +389,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; - if (!(flgs & IFF_UP)) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) vlan_sync_address(dev, vlandev); - } break; case NETDEV_CHANGEMTU: -- 1.7.10.4
Re: [PATCH net] Propagate MAC address changes to VLANs
On 03/03/2016 09:12 PM, David Miller wrote: > From: Mike Manning <mmann...@brocade.com> > Date: Mon, 29 Feb 2016 11:32:51 + > >> >> -/* vlan address was equal to the old address and is different from >> +/* vlan address was equal to the old address so now also inherit >> * the new address */ >> -if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && >> -!ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) >> -dev_uc_add(dev, vlandev->dev_addr); >> +if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr)) >> +ether_addr_copy(vlandev->dev_addr, dev->dev_addr); >> > > This dev_uc_add() call removal cannot be correct, if the device is up > we must program it into the hardware unicast filters and also > potentially put it into promiscuous mode via __dev_set_rx_mode(). > The call to dev_uc_add() to add a secondary address is only needed if the VLAN MAC is different from that for the physical interface. For the proposed changes, the VLAN MAC is tracking that of the physical interface and so is the same (as typically it does not make sense for these to be different), so dev_uc_add() should not be called. The easiest way to demonstrate equivalence with the original code, where the MAC address has to be set manually, is with some test debugs. 
Here, first the MAC of the interface itself is changed (so dev_uc_add() is called), then the MAC of the VLAN is changed (so dev_uc_del() is called): 1) ORIGINAL CODE: ip addr show dev dp0s8 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff sudo ip link set dp0s8.40 addr 10:20:30:40:50:61 sudo ip link set dp0s8 addr 10:20:30:40:50:61 ip addr show dev dp0s8 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff [ 3990.332577] --- vlan_dev_set_mac_address: id 40, call dev_uc_add for 10:20:30:40:50:61 on dp0s8 [ 3990.332579] device dp0s8 entered promiscuous mode [ 4002.425234] 8021q: --- vlan_sync_address: id 40, for 10:20:30:40:50:61 on dp0s8.40 (from dp0s8) [ 4002.425472] --- vlan_sync_address: id 40, call dev_uc_del for 10:20:30:40:50:61 on dp0s8 [ 4002.425475] --- __hw_addr_del_entry: refcount 0 for 10:20:30:40:50:61 [ 4002.425477] device dp0s8 left promiscuous mode sudo ip link set dp0s8 addr 52:54:00:1f:06:2a ip addr show dev dp0s8.40 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff sudo ip link set dp0s8.40 addr 52:54:00:1f:06:2a ip addr show dev dp0s8 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff [ 4121.606671] --- vlan_sync_address: id, 40, call dev_uc_add for 10:20:30:40:50:61 on dp0s8 [ 4121.606673] device dp0s8 entered promiscuous mode [ 4147.487780] --- vlan_dev_set_mac_address: id 40, for 52:54:00:1f:06:2a on dp0s8 [ 4147.487782] --- vlan_dev_set_mac_address: id 40, call dev_uc_del for 10:20:30:40:50:61 dp0s8 [ 4147.487784] --- __hw_addr_del_entry: refcount 0 for 10:20:30:40:50:61 [ 4147.487786] device dp0s8 left promiscuous mode 2) WITH IMPROVEMENT FOR VLAN MAC TO FOLLOW THAT OF PHYSICAL INTF, UNLESS EXPLICITLY SET: ip addr show dev 
dp0s8 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff sudo ip link set dp0s8 addr 10:20:30:40:50:61 ip addr show dev dp0s8 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff [ 196.574789] 8021q: --- vlan_sync_address: id 40, for 10:20:30:40:50:61 on dp0s8.40 (from dp0 s8) [ 196.575004] --- vlan_sync_address: id 40, update to 10:20:30:40:50:61 on dp0s8.40 (from dp0s 8) sudo ip link set dp0s8 addr 52:54:00:1f:06:2a ip addr show dev dp0s8 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff [ 265.683313] 8021q: --- vlan_sync_address: id 40, for 52:54:00:1f:06:2a on dp0s8.40 (from dp0 s8) [ 265.683534] --- vlan_sync_address: id 40, update to 52:54:00:1f:06:2a on dp0s8.40 (from dp0s 8) sudo ip link set dp0s8.40 addr 10:20:30:40:50:61 sudo ip link set dp0s8 addr 10:20:30:40:50:99 ip addr show dev dp0s8 | grep ether link/ether 10:20:30:40:50:99 brd ff:ff:ff:ff:ff:ff ip addr show dev dp0s8.40 | grep ether link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff sudo ip link set dp0s8 addr 10:20:30:40:50:61 [ 5561.791222] --- vlan_dev_set_mac_address: id 40, for 10:20:30:40:50:61 on dp0s8 [ 5561.791225] --- vlan_dev_