from:"Mike Manning"

[PATCH net-next v5 9/9] ipv6: do not drop vrf udp multicast packets

2018-11-07 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/udp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 895fa77bde90..a905bf9ed906 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(>sk_v6_daddr) &&
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(>sk_v6_rcv_saddr) &&
!ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, >head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next v5 8/9] ipv6: handling of multicast packets received in VRF

2018-11-07 Thread Mike Manning

If multicast packets are received whose skb is marked as enslaved to a
VRF, then the secondary device index should be used to obtain the real
device. The multicast address is then verified against the enslaved
device rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..df58e1100226 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+   int sdif = inet6_sdif(skb);
+   struct net_device *dev;
 
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   dev = dev_get_by_index_rcu(net, sdif);
+   if (!dev)
+   goto discard;
+   } else {
+   dev = skb->dev;
+   }
+
if (ipv6_addr_is_multicast(>daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, >daddr,
-   >saddr) &&
+   !ipv6_chk_mcast_addr(dev, >daddr,
+>saddr) &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
@@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
-- 
2.11.0

[PATCH net-next v5 7/9] ipv6: allow ping to link-local address in VRF

2018-11-07 Thread Mike Manning

If link-local packets are marked as enslaved to a VRF, then to allow
ping to the link-local from a vrf, the error handling for IPV6_PKTINFO
needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a
slave device to the vrf.

Note that the real device also needs to be retrieved in icmp6_iif()
to set the ipv6 flow oif to this for icmp echo reply handling. The
recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address
with VRF") takes care of this, so the sdif does not need checking here.

This fix makes ping to link-local consistent with that to global
addresses, in that this can now be done from within the same VRF that
the address is in.

Signed-off-by: Mike Manning 
---
 net/ipv6/ipv6_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 381ce38940ae..973e215c3114 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next v5 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs

2018-11-07 Thread Mike Manning

Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner
similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept
for datagram sockets. The sysctl defaults to enabled for reasons of
backwards compatibility, so that the output device can still be specified
with cmsg and IP_PKTINFO while using a socket that is not bound to the
corresponding VRF. This allows e.g. older ping implementations to be
run by specifying the device but without executing them in the VRF.
If the option is disabled, packets received in a VRF context are only
handled by a raw socket bound to the VRF, and correspondingly packets
in the default VRF are only handled by a socket not bound to any VRF.

Signed-off-by: Mike Manning 
---
 Documentation/networking/ip-sysctl.txt | 12 
 Documentation/networking/vrf.txt   | 13 +
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  |  1 +
 net/ipv4/af_inet.c |  2 ++
 net/ipv4/raw.c | 28 ++--
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 7 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 163b5ff1073c..e0e72e2ff6b2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -359,6 +359,7 @@ tcp_l3mdev_accept - BOOLEAN
derived from the listen socket to be bound to the L3 domain in
which the packets originated. Only valid when the kernel was
compiled with CONFIG_NET_L3_MASTER_DEV.
+Default: 0 (disabled)
 
 tcp_low_latency - BOOLEAN
This is a legacy option, it has no effect anymore.
@@ -762,6 +763,7 @@ udp_l3mdev_accept - BOOLEAN
being received regardless of the L3 domain in which they
originated. Only valid when the kernel was compiled with
CONFIG_NET_L3_MASTER_DEV.
+Default: 0 (disabled)
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
Number of pages allowed for queueing by all UDP sockets.
@@ -788,6 +790,16 @@ udp_wmem_min - INTEGER
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
Default: 4K
 
+RAW variables:
+
+raw_l3mdev_accept - BOOLEAN
+   Enabling this option allows a "global" bound socket to work
+   across L3 master domains (e.g., VRFs) with packets capable of
+   being received regardless of the L3 domain in which they
+   originated. Only valid when the kernel was compiled with
+   CONFIG_NET_L3_MASTER_DEV.
+   Default: 1 (enabled)
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index d4b129402d57..a5f103b083a0 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -111,9 +111,22 @@ the same port if they bind to an l3mdev.
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
+
 sysctl -w net.ipv4.tcp_l3mdev_accept=1
 sysctl -w net.ipv4.udp_l3mdev_accept=1
 
+These options are disabled by default so that a socket in a VRF is only
+selected for packets in that VRF. There is a similar option for RAW
+sockets, which is enabled by default for reasons of backwards compatibility.
+This is so as to specify the output device with cmsg and IP_PKTINFO, but
+using a socket not bound to the corresponding VRF. This allows e.g. older ping
+implementations to be run with specifying the device but without executing it
+in the VRF. This option can be disabled so that packets received in a VRF
+context are only handled by a raw socket bound to the VRF, and packets in the
+default VRF are only handled by a socket not bound to any VRF:
+
+sysctl -w net.ipv4.raw_l3mdev_accept=0
+
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e47503b4e4d1..104a6669e344 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,9 @@ struct netns_ipv4 {
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   int sysctl_raw_l3mdev_accept;
+#endif
int sysctl_tcp_early_demux;
int sysctl_udp_early_demux;
 
diff --git a/include/net/raw.h b/include/net/raw.h
index 9c9fa98a91a4..20ebf0b3dfa8 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -61,6 +61,7 @@ void raw_seq_stop(struct seq_file *seq, void *v);
 
 int raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
+void raw_init(void);
 
 struct raw_sock {
/* inet_sock has to be the first member */
diff --git a/net/ipv4/af_ine

[PATCH net-next v5 5/9] net: fix raw socket lookup device bind matching with VRFs

2018-11-07 Thread Mike Manning

From: Duncan Eastoe 

When there exists a pair of raw sockets, one unbound and one bound
to a VRF but equal in all other respects, and a packet is received
in the VRF context, __raw_v4_lookup() matches on both sockets.

This results in the packet being delivered over both sockets,
instead of only the raw socket bound to the VRF. The bound device
checks in __raw_v4_lookup() are replaced with a call to
raw_sk_bound_dev_eq() which correctly handles whether the packet
should be delivered over the unbound socket in such cases.

In __raw_v6_lookup() the match on the device binding of the socket is
similarly updated to use raw_sk_bound_dev_eq() which matches the
handling in __raw_v4_lookup().

Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl
into account.

Signed-off-by: Duncan Eastoe 
Signed-off-by: Mike Manning 
---
 include/net/raw.h | 13 -
 net/ipv4/raw.c|  3 +--
 net/ipv6/raw.c|  5 ++---
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/net/raw.h b/include/net/raw.h
index 20ebf0b3dfa8..821ff4887f77 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -17,7 +17,7 @@
 #ifndef _RAW_H
 #define _RAW_H
 
-
+#include 
 #include 
 #include 
 
@@ -75,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk)
return (struct raw_sock *)sk;
 }
 
+static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(!!net->ipv4.sysctl_raw_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
+#endif
+}
+
 #endif /* _RAW_H */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1ebd29abe79c..fb1f02015a15 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock 
*sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num  &&
!(inet->inet_daddr && inet->inet_daddr != raddr)&&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-   !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+   raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5e0efd3954e9..aed7eb5c2123 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr))
continue;
 
-   if (sk->sk_bound_dev_if &&
-   sk->sk_bound_dev_if != dif &&
-   sk->sk_bound_dev_if != sdif)
+   if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif))
continue;
 
if (!ipv6_addr_any(>sk_v6_rcv_saddr)) {
-- 
2.11.0

[PATCH net-next v5 0/9] vrf: allow simultaneous service instances in default and other VRFs

2018-11-07 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no easy way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation for stream sockets subject to the
existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set,
given that this is documented as allowing a single service instance to
work across all VRF domains. Similarly, net.ipv4.udp_l3mdev_accept is
checked for datagram sockets, and net.ipv4.raw_l3mdev_accept is
introduced for raw sockets. The functionality applies to UDP & TCP
services as well as those using raw sockets, and is for IPv4 and IPv6.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
LISTEN  0128   0.0.0.0%vrf-blue:ssh 0.0.0.0:*
LISTEN  01280.0.0.0:ssh 0.0.0.0:*
ESTAB   00  192.168.122.220:ssh   192.168.122.1:50282
LISTEN  0128  [::]%vrf-blue:ssh[::]:*
LISTEN  0128   [::]:ssh[::]:*
ESTAB   00   [3000::2]%vrf-blue:ssh   [3000::9]:45896
ESTAB   00[2000::2]:ssh   [2000::9]:46398

v1:
   - Address Paolo Abeni's comments (patch 4/5)
   - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5)
v2:
   - Address David Ahern's comments (patches 4/5 and 5/5)
   - Remove patches 3/5 and 5/5 from series for individual submissions
   - Include a sysctl for raw sockets as recommended by David Ahern
   - Expand series into 10 patches and provide improved descriptions
v3:
   - Update description for patch 1/10 and remove patch 6/10
v4:
   - Set default to enabled for raw socket sysctl as recommended by David Ahern
v5:
   - Address review comments from David Ahern in patches 2-5

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Duncan Eastoe (1):
  net: fix raw socket lookup device bind matching with VRFs

Mike Manning (6):
  net: ensure unbound stream socket to be chosen when not in a VRF
  net: ensure unbound datagram socket to be chosen when not in a VRF
  net: provide a sysctl raw_l3mdev_accept for raw socket lookup with
VRFs
  vrf: mark skb for multicast or link-local as enslaved to VRF
  ipv6: allow ping to link-local address in VRF
  ipv6: handling of multicast packets received in VRF

Robert Shearman (1):
  net: allow binding socket in a VRF when there's an unbound socket

 Documentation/networking/ip-sysctl.txt | 12 
 Documentation/networking/vrf.txt   | 22 +
 drivers/net/vrf.c  | 19 +-
 include/net/inet6_hashtables.h |  5 ++---
 include/net/inet_hashtables.h  | 24 ---
 include/net/inet_sock.h| 21 
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  | 14 +-
 include/net/udp.h  | 11 +++
 net/core/sock.c|  2 ++
 net/ipv4/af_inet.c |  2 ++
 net/ipv4/inet_connection_sock.c| 13 ++---
 net/ipv4/inet_hashtables.c | 34 -
 net/ipv4/raw.c | 31 ++
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 net/ipv4/udp.c | 15 ++-
 net/ipv6/datagram.c| 10 +++---
 net/ipv6/inet6_hashtables.c| 14 ++
 net/ipv6/ip6_input.c   | 35 +++---
 net/ipv6/ipv6_sockglue.c   |  2 +-
 net/ipv6/raw.c |  5 ++---
 net/ipv6/udp.c | 22 ++---
 22 files changed, 243 insertions(+), 84 deletions(-)

-- 
2.11.0

[PATCH net-next v5 1/9] net: allow binding socket in a VRF when there's an unbound socket

2018-11-07 Thread Mike Manning

From: Robert Shearman 

Change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

Change the socket binding to take the l3mdev into account, so that an
unbound socket does not conflict with sockets bound to an l3mdev, given
the datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 13 ++---
 include/net/inet_sock.h  | 13 +
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 20 +++-
 6 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..4ae060b4bac2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
-   const unsigned short snum);
+   const unsigned short snum, int l3mdev);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
  struct inet_bind_bucket *tb);
 
@@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net 
*net,
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, 
__sdif) \
(((__sk)->sk_portpair == (__ports)) &&  \
 ((__sk)->sk_addrpair == (__cookie))&&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \

[PATCH net-next v5 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF

2018-11-07 Thread Mike Manning

Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must also be performed
when the skt has no bound device. In addition, a failure is returned when
there is no device match. This ensures that bound sockets are never
selected, even if there is no unbound socket.

Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.

Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.

Signed-off-by: Mike Manning 
---
 include/net/udp.h   | 11 +++
 net/core/sock.c |  2 ++
 net/ipv4/udp.c  | 15 ++-
 net/ipv6/datagram.c | 10 +++---
 net/ipv6/udp.c  | 14 +-
 5 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 9e82cb391dea..a496e441645e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk)
return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
 }
 
+static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
+#endif
+}
+
 /* net/ipv4/udp.c */
 void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7e8796a6a089..2fff1796dcba 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char 
__user *optval,
 
lock_sock(sk);
sk->sk_bound_dev_if = index;
+   if (sk->sk_prot->rehash)
+   sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 07d6fb80f433..2c01d52a8dd9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net,
 {
int score;
struct inet_sock *inet;
+   bool dev_match;
 
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
 
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
+   dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+   dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..bde08aa549f3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -772,6 +772,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
case IPV6_2292PKTINFO:
{
struct net_device *dev = NULL;
+   int src_idx;
 
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct 
in6_pktinfo))) {
err = -EINVAL;
@@ -779,12 +780,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock 
*sk,
}
 
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+   src_idx = src_info->ipi6_ifindex;
 
-   if (src_info->ipi6_ifindex) {
+   if (src_idx)

[PATCH net-next v5 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF

2018-11-07 Thread Mike Manning

The skbs for packets that are multicast or sent to a link-local address
are not marked as being enslaved to a VRF if they are received on a socket
bound to the VRF. This is needed for ND and it is preferable for the
kernel not to have to deal with the additional use-cases if ll or mcast
packets are handled as enslaved. However, this does not allow service
instances listening on unbound and bound to VRF sockets to distinguish
the VRF used, if packets are sent as multicast or to a link-local
address. The fix is for the VRF driver to also mark these skb as being
enslaved to the VRF.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 69b7227c637e..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
-- 
2.11.0

[PATCH net-next v5 2/9] net: ensure unbound stream socket to be chosen when not in a VRF

2018-11-07 Thread Mike Manning

The commit a04a480d4392 ("net: Require exact match for TCP socket
lookups if dif is l3mdev") only ensures that the correct socket is
selected for packets in a VRF. However, there is no guarantee that
the unbound socket will be selected for packets when not in a VRF.
By checking for a device match in compute_score() also for the case
when there is no bound device and attaching a score to this, the
unbound socket is selected. And if a failure is returned when there
is no device match, this ensures that bound sockets are never selected,
even if there is no unbound socket.

Signed-off-by: Mike Manning 
---
 include/net/inet_hashtables.h | 11 +++
 include/net/inet_sock.h   |  8 
 net/ipv4/inet_hashtables.c| 14 ++
 net/ipv6/inet6_hashtables.c   | 14 ++
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4ae060b4bac2..0ce460e93dc4 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+   int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(!!net->ipv4.sysctl_tcp_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(true, bound_dev_if, dif, sdif);
+#endif
+}
+
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ed3f723af00b..e8eef85006aa 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock 
*sk)
return 0;
 }
 
+static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
+int dif, int sdif)
+{
+   if (!bound_dev_if)
+   return !sdif || l3mdev_accept;
+   return bound_dev_if == dif || bound_dev_if == sdif;
+}
+
 struct inet_cork {
unsigned intflags;
__be32  addr;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 260531dc6458..2ec684057ebd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
 {
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score += 4;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5eeeba7181a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
const int dif, const int sdif, bool exact_dif)
 {
int score = -1;
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
@@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score++;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score

[PATCH net-next v4 2/9] net: ensure unbound stream socket to be chosen when not in a VRF

2018-11-02 Thread Mike Manning

The commit a04a480d4392 ("net: Require exact match for TCP socket
lookups if dif is l3mdev") only ensures that the correct socket is
selected for packets in a VRF. However, there is no guarantee that
the unbound socket will be selected for packets when not in a VRF.
By checking for a device match in compute_score() also for the case
when there is no bound device and attaching a score to this, the
unbound socket is selected. And if a failure is returned when there
is no device match, this ensures that bound sockets are never selected,
even if there is no unbound socket.

Signed-off-by: Mike Manning 
---
 include/net/inet_hashtables.h | 11 +++
 include/net/inet_sock.h   |  8 
 net/ipv4/inet_hashtables.c| 14 ++
 net/ipv6/inet6_hashtables.c   | 14 ++
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4ae060b4bac2..5de2d9f24c05 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+   int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept,
+   bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index ed3f723af00b..e8eef85006aa 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock 
*sk)
return 0;
 }
 
+static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
+int dif, int sdif)
+{
+   if (!bound_dev_if)
+   return !sdif || l3mdev_accept;
+   return bound_dev_if == dif || bound_dev_if == sdif;
+}
+
 struct inet_cork {
unsigned intflags;
__be32  addr;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 40d722ab1738..13890d5bfc34 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
 {
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score += 4;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5eeeba7181a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
const int dif, const int sdif, bool exact_dif)
 {
int score = -1;
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
@@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score++;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score

[PATCH net-next v4 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF

2018-11-02 Thread Mike Manning

The skbs for packets that are multicast or sent to a link-local address
are not marked as being enslaved to a VRF if they are received on a socket
bound to the VRF. This is needed for ND and it is preferable for the
kernel not to have to deal with the additional use-cases if ll or mcast
packets are handled as enslaved. However, this does not allow service
instances listening on unbound and bound to VRF sockets to distinguish
the VRF used, if packets are sent as multicast or to a link-local
address. The fix is for the VRF driver to also mark these skb as being
enslaved to the VRF.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 69b7227c637e..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
-- 
2.11.0

[PATCH net-next v4 5/9] net: fix raw socket lookup device bind matching with VRFs

2018-11-02 Thread Mike Manning

From: Duncan Eastoe 

When there exists a pair of raw sockets, one unbound and one bound
to a VRF but equal in all other respects, and a packet is received
in the VRF context, __raw_v4_lookup() matches on both sockets.

This results in the packet being delivered over both sockets,
instead of only the raw socket bound to the VRF. The bound device
checks in __raw_v4_lookup() are replaced with a call to
raw_sk_bound_dev_eq() which correctly handles whether the packet
should be delivered over the unbound socket in such cases.

In __raw_v6_lookup() the match on the device binding of the socket is
similarly updated to use raw_sk_bound_dev_eq() which matches the
handling in __raw_v4_lookup().

Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl
into account.

Signed-off-by: Duncan Eastoe 
Signed-off-by: Mike Manning 
---
 include/net/raw.h | 12 
 net/ipv4/raw.c|  3 +--
 net/ipv6/raw.c|  5 ++---
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/net/raw.h b/include/net/raw.h
index 20ebf0b3dfa8..6ed2ae5b4a80 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -18,6 +18,7 @@
 #define _RAW_H
 
 
+#include 
 #include 
 #include 
 
@@ -75,4 +76,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk)
return (struct raw_sock *)sk;
 }
 
+static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 #endif /* _RAW_H */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index da453c7dfb75..d42cdd018987 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock 
*sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num  &&
!(inet->inet_daddr && inet->inet_daddr != raddr)&&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-   !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+   raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5e0efd3954e9..aed7eb5c2123 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr))
continue;
 
-   if (sk->sk_bound_dev_if &&
-   sk->sk_bound_dev_if != dif &&
-   sk->sk_bound_dev_if != sdif)
+   if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif))
continue;
 
if (!ipv6_addr_any(>sk_v6_rcv_saddr)) {
-- 
2.11.0

[PATCH net-next v4 0/9] vrf: allow simultaneous service instances in default and other VRFs

2018-11-02 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no easy way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation for stream sockets subject to the
existing kernel parameter net.ipv4.tcp_l3mdev_accept not being set,
given that this is documented as allowing a single service instance to
work across all VRF domains. Similarly, net.ipv4.udp_l3mdev_accept is
checked for datagram sockets, and net.ipv4.raw_l3mdev_accept is
introduced for raw sockets. The functionality applies to UDP & TCP
services as well as those using raw sockets, and is for IPv4 and IPv6.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
LISTEN  0128   0.0.0.0%vrf-blue:ssh 0.0.0.0:*
LISTEN  01280.0.0.0:ssh 0.0.0.0:*
ESTAB   00  192.168.122.220:ssh   192.168.122.1:50282
LISTEN  0128  [::]%vrf-blue:ssh[::]:*
LISTEN  0128   [::]:ssh[::]:*
ESTAB   00   [3000::2]%vrf-blue:ssh   [3000::9]:45896
ESTAB   00[2000::2]:ssh   [2000::9]:46398

v1:
   - Address Paolo Abeni's comments (patch 4/5)
   - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5)
v2:
   - Address David Ahern's comments (patches 4/5 and 5/5)
   - Remove patches 3/5 and 5/5 from series for individual submissions
   - Include a sysctl for raw sockets as recommended by David Ahern
   - Expand series into 10 patches and provide improved descriptions
v3:
   - Update description for patch 1/10 and remove patch 6/10
v4:
   - Set default to enabled for raw socket sysctl as recommended by David Ahern

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Duncan Eastoe (1):
  net: fix raw socket lookup device bind matching with VRFs

Mike Manning (6):
  net: ensure unbound stream socket to be chosen when not in a VRF
  net: ensure unbound datagram socket to be chosen when not in a VRF
  net: provide a sysctl raw_l3mdev_accept for raw socket lookup with
VRFs
  vrf: mark skb for multicast or link-local as enslaved to VRF
  ipv6: allow ping to link-local address in VRF
  ipv6: handling of multicast packets received in VRF

Robert Shearman (1):
  net: allow binding socket in a VRF when there's an unbound socket

 Documentation/networking/ip-sysctl.txt | 12 
 Documentation/networking/vrf.txt   | 22 +
 drivers/net/vrf.c  | 19 +-
 include/net/inet6_hashtables.h |  5 ++---
 include/net/inet_hashtables.h  | 24 ---
 include/net/inet_sock.h| 21 
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  | 13 +
 include/net/udp.h  | 11 +++
 net/core/sock.c|  2 ++
 net/ipv4/af_inet.c |  2 ++
 net/ipv4/inet_connection_sock.c| 13 ++---
 net/ipv4/inet_hashtables.c | 34 -
 net/ipv4/raw.c | 19 ++
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 net/ipv4/udp.c | 15 ++-
 net/ipv6/datagram.c|  5 -
 net/ipv6/inet6_hashtables.c| 14 ++
 net/ipv6/ip6_input.c   | 35 +++---
 net/ipv6/ipv6_sockglue.c   |  2 +-
 net/ipv6/raw.c |  5 ++---
 net/ipv6/udp.c | 22 ++---
 22 files changed, 228 insertions(+), 81 deletions(-)

-- 
2.11.0

[PATCH net-next v4 7/9] ipv6: allow ping to link-local address in VRF

2018-11-02 Thread Mike Manning

If link-local packets are marked as enslaved to a VRF, then to allow
ping to the link-local address from a vrf, the error handling for IPV6_PKTINFO
needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a
slave device to the vrf.

Note that the real device also needs to be retrieved in icmp6_iif()
to set the ipv6 flow oif to this for icmp echo reply handling. The
recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address
with VRF") takes care of this, so the sdif does not need checking here.

This fix makes ping to link-local consistent with that to global
addresses, in that this can now be done from within the same VRF that
the address is in.

Signed-off-by: Mike Manning 
---
 net/ipv6/ipv6_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 381ce38940ae..973e215c3114 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next v4 8/9] ipv6: handling of multicast packets received in VRF

2018-11-02 Thread Mike Manning

If an skb for a multicast packet marked as enslaved to a VRF is
received, then the secondary device index should be used to obtain
the real device. The multicast address is then verified against the
enslaved device rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..df58e1100226 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+   int sdif = inet6_sdif(skb);
+   struct net_device *dev;
 
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   dev = dev_get_by_index_rcu(net, sdif);
+   if (!dev)
+   goto discard;
+   } else {
+   dev = skb->dev;
+   }
+
if (ipv6_addr_is_multicast(>daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, >daddr,
-   >saddr) &&
+   !ipv6_chk_mcast_addr(dev, >daddr,
+>saddr) &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
@@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
-- 
2.11.0

[PATCH net-next v4 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs

2018-11-02 Thread Mike Manning

Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner
similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept
for datagram sockets. Have this default to enabled for reasons of
backwards compatibility. This is so as to specify the output device
with cmsg and IP_PKTINFO, but using a socket not bound to the
corresponding VRF. This allows e.g. older ping implementations to be
run with specifying the device but without executing it in the VRF.
If the option is disabled, packets received in a VRF context are only
handled by a raw socket bound to the VRF, and correspondingly packets
in the default VRF are only handled by a socket not bound to any VRF.

Signed-off-by: Mike Manning 
---
 Documentation/networking/ip-sysctl.txt | 12 
 Documentation/networking/vrf.txt   | 13 +
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  |  1 +
 net/ipv4/af_inet.c |  2 ++
 net/ipv4/raw.c | 16 ++--
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 7 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 32b21571adfe..aa9e6a331679 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -370,6 +370,7 @@ tcp_l3mdev_accept - BOOLEAN
derived from the listen socket to be bound to the L3 domain in
which the packets originated. Only valid when the kernel was
compiled with CONFIG_NET_L3_MASTER_DEV.
+Default: 0 (disabled)
 
 tcp_low_latency - BOOLEAN
This is a legacy option, it has no effect anymore.
@@ -773,6 +774,7 @@ udp_l3mdev_accept - BOOLEAN
being received regardless of the L3 domain in which they
originated. Only valid when the kernel was compiled with
CONFIG_NET_L3_MASTER_DEV.
+Default: 0 (disabled)
 
 udp_mem - vector of 3 INTEGERs: min, pressure, max
Number of pages allowed for queueing by all UDP sockets.
@@ -799,6 +801,16 @@ udp_wmem_min - INTEGER
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
Default: 4K
 
+RAW variables:
+
+raw_l3mdev_accept - BOOLEAN
+   Enabling this option allows a "global" bound socket to work
+   across L3 master domains (e.g., VRFs) with packets capable of
+   being received regardless of the L3 domain in which they
+   originated. Only valid when the kernel was compiled with
+   CONFIG_NET_L3_MASTER_DEV.
+   Default: 1 (enabled)
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index d4b129402d57..d234c9750c72 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -111,9 +111,22 @@ the same port if they bind to an l3mdev.
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
+
 sysctl -w net.ipv4.tcp_l3mdev_accept=1
 sysctl -w net.ipv4.udp_l3mdev_accept=1
 
+These options are disabled by default so that a socket in a VRF is only
+selected for packets in that VRF. There is a similar option for RAW
+sockets, which is enabled by default for reasons of backwards compatibility.
+This is so as to specify the output device with cmsg and IP_PKTINFO, but
+using a socket not bound to the corresponding VRF. This allows e.g. older ping
+implementations to be run with specifying the device but without executing it
+in the VRF. This option can be disabled so that packets received in a VRF
+context are only handled by a raw socket bound to the VRF, and packets in the
+default VRF are only handled by a socket not bound to any VRF:
+
+sysctl -w net.ipv4.raw_l3mdev_accept=0
+
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e47503b4e4d1..104a6669e344 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,9 @@ struct netns_ipv4 {
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   int sysctl_raw_l3mdev_accept;
+#endif
int sysctl_tcp_early_demux;
int sysctl_udp_early_demux;
 
diff --git a/include/net/raw.h b/include/net/raw.h
index 9c9fa98a91a4..20ebf0b3dfa8 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -61,6 +61,7 @@ void raw_seq_stop(struct seq_file *seq, void *v);
 
 int raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
+void raw_init(void);
 
 struct raw_sock {
/* inet_sock has to be the first member */
diff --git a/net/ipv4/af_inet.c b/net/

[PATCH net-next v4 9/9] ipv6: do not drop vrf udp multicast packets

2018-11-02 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/udp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 0559adc2f357..a25571c12a8a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(>sk_v6_daddr) &&
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(>sk_v6_rcv_saddr) &&
!ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, >head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next v4 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF

2018-11-02 Thread Mike Manning

Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must also be performed when
there is no bound device. For this, a failure is returned when there is
no device match. This ensures that bound sockets are never selected,
even if there is no unbound socket.

Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.

Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.

Signed-off-by: Mike Manning 
---
 include/net/udp.h   | 11 +++
 net/core/sock.c |  2 ++
 net/ipv4/udp.c  | 15 ++-
 net/ipv6/datagram.c |  5 -
 net/ipv6/udp.c  | 14 +-
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 9e82cb391dea..057972d0eea5 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk)
return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
 }
 
+static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 /* net/ipv4/udp.c */
 void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6fcc4bc07d19..6eda848192aa 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char 
__user *optval,
 
lock_sock(sk);
sk->sk_bound_dev_if = index;
+   if (sk->sk_prot->rehash)
+   sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1976fddb9e00..cf73c9194bb6 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net,
 {
int score;
struct inet_sock *inet;
+   bool dev_match;
 
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
 
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
+   dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+   dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..4813293d4fad 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
if (src_info->ipi6_ifindex) {
if (fl6->flowi6_oif &&
-   src_info->ipi6_ifindex != fl6->flowi6_oif)
+   src_info->ipi6_ifindex != fl6->flowi6_oif &&
+   (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+!sk_dev_equal_l3scope(
+sk, src_info->ipi6_ifindex)))
return -EINVAL;
fl6->flowi6_oif = src_info->

[PATCH net-next v4 1/9] net: allow binding socket in a VRF when there's an unbound socket

2018-11-02 Thread Mike Manning

From: Robert Shearman 

Change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

Change the socket binding to take the l3mdev into account to allow an
unbound socket to not conflict with sockets bound to an l3mdev given the
datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 13 ++---
 include/net/inet_sock.h  | 13 +
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 20 +++-
 6 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..4ae060b4bac2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
-   const unsigned short snum);
+   const unsigned short snum, int l3mdev);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
  struct inet_bind_bucket *tb);
 
@@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net 
*net,
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, 
__sdif) \
(((__sk)->sk_portpair == (__ports)) &&  \
 ((__sk)->sk_addrpair == (__cookie))&&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \

[PATCH] net: allow traceroute with a specified interface in a vrf

2018-10-26 Thread Mike Manning

Traceroute executed in a vrf succeeds if no device is given or if the
vrf is given as the device, but fails if the interface is given as the
device. This is for default UDP probes; it succeeds for TCP SYN or ICMP
ECHO probes. As the skb bound dev is the interface and the sk dev is
the vrf, sk lookup fails for ICMP_DEST_UNREACH and ICMP_TIME_EXCEEDED
messages. The solution is for the secondary dev to be passed so that
the interface is available for the device match to succeed, in the same
way as is already done for non-error cases.

Signed-off-by: Mike Manning 
---
 net/ipv4/udp.c | 4 ++--
 net/ipv6/udp.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1f5e78d1477d..c9bc08915153 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -676,8 +676,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct 
udp_table *udptable)
struct net *net = dev_net(skb->dev);
 
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
-  iph->saddr, uh->source, skb->dev->ifindex, 0,
-  udptable, NULL);
+  iph->saddr, uh->source, skb->dev->ifindex,
+  inet_sdif(skb), udptable, NULL);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4f0a8728d723..740be1fbd4f5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -543,7 +543,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct 
inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
 
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
-  inet6_iif(skb), 0, udptable, skb);
+  inet6_iif(skb), inet6_sdif(skb), udptable, skb);
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
  ICMP6_MIB_INERRORS);
-- 
2.11.0

[PATCH net-next v3 4/9] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs

2018-10-04 Thread Mike Manning

Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner
similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept
for datagram sockets. Have this default to off as this is what users
expect, given that there is no explicit mechanism to set an unmodified
VRF-unaware application into a default VRF.

Signed-off-by: Mike Manning 
---
 Documentation/networking/ip-sysctl.txt |  9 +
 Documentation/networking/vrf.txt   |  8 +---
 include/net/netns/ipv4.h   |  3 +++
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 8313a636dd53..a46be4a5b7a0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -788,6 +788,15 @@ udp_wmem_min - INTEGER
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
Default: 4K
 
+RAW variables:
+
+raw_l3mdev_accept - BOOLEAN
+   Enabling this option allows a "global" bound socket to work
+   across L3 master domains (e.g., VRFs) with packets capable of
+   being received regardless of the L3 domain in which they
+   originated. Only valid when the kernel was compiled with
+   CONFIG_NET_L3_MASTER_DEV.
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index d4b129402d57..deb798342f1e 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -108,11 +108,13 @@ limited to the default VRF. That is, it will not be 
matched by packets
 arriving on interfaces enslaved to an l3mdev and processes may bind to
 the same port if they bind to an l3mdev.
 
-TCP & UDP services running in the default VRF context (ie., not bound
-to any VRF device) can work across all VRF domains by enabling the
-tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
+TCP & UDP services & services using RAW sockets that are running in the
+default VRF context (ie., not bound to any VRF device) can work across
+all VRF domains by enabling the tcp_l3mdev_accept, udp_l3mdev_accept and
+raw_l3mdev_accept sysctl options:
 sysctl -w net.ipv4.tcp_l3mdev_accept=1
 sysctl -w net.ipv4.udp_l3mdev_accept=1
+sysctl -w net.ipv4.raw_l3mdev_accept=1
 
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e47503b4e4d1..104a6669e344 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,9 @@ struct netns_ipv4 {
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   int sysctl_raw_l3mdev_accept;
+#endif
int sysctl_tcp_early_demux;
int sysctl_udp_early_demux;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b92f422f2fa8..d173337040ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -601,6 +601,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode   = 0644,
.proc_handler   = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   {
+   .procname   = "raw_l3mdev_accept",
+   .data   = _net.ipv4.sysctl_raw_l3mdev_accept,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = ,
+   .extra2 = ,
+   },
+#endif
{
.procname   = "tcp_ecn",
.data   = _net.ipv4.sysctl_tcp_ecn,
-- 
2.11.0

[PATCH net-next v3 2/9] net: ensure unbound stream socket to be chosen when not in a VRF

2018-10-04 Thread Mike Manning

The commit a04a480d4392 ("net: Require exact match for TCP socket
lookups if dif is l3mdev") only ensures that the correct socket is
selected for packets in a VRF. However, there is no guarantee that
the unbound socket will be selected for packets when not in a VRF.
By checking for a device match in compute_score() also for the case
when there is no bound device and attaching a score to this, the
unbound socket is selected. And if a failure is returned when there
is no device match, this ensures that bound sockets are never selected,
even if there is no unbound socket.

Signed-off-by: Mike Manning 
---
 include/net/inet_hashtables.h | 11 +++
 include/net/inet_sock.h   |  8 
 net/ipv4/inet_hashtables.c| 14 ++
 net/ipv6/inet6_hashtables.c   | 14 ++
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4ae060b4bac2..5de2d9f24c05 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+   int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept,
+   bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 92e0aa3958f6..47c03ea989ad 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock 
*sk)
return 0;
 }
 
+static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
+int dif, int sdif)
+{
+   if (!bound_dev_if)
+   return !sdif || l3mdev_accept;
+   return bound_dev_if == dif || bound_dev_if == sdif;
+}
+
 static inline struct ip_options_rcu *ireq_opt_deref(const struct 
inet_request_sock *ireq)
 {
return rcu_dereference_check(ireq->ireq_opt,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 260531dc6458..2ec684057ebd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
 {
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score += 4;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5eeeba7181a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
const int dif, const int sdif, bool exact_dif)
 {
int score = -1;
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
@@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score++;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+

[PATCH net-next v3 3/9] net: ensure unbound datagram socket to be chosen when not in a VRF

2018-10-04 Thread Mike Manning

Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must also be performed
when there is no bound device. For this, a failure is returned when
there is no device match. This ensures that bound sockets are never
selected, even if there is no unbound socket.

Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.

Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.

Signed-off-by: Mike Manning 
---
 include/net/udp.h   | 11 +++
 net/core/sock.c |  2 ++
 net/ipv4/udp.c  | 15 ++-
 net/ipv6/datagram.c |  5 -
 net/ipv6/udp.c  | 14 +-
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..1e4fb8feaf50 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk)
return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
 }
 
+static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 /* net/ipv4/udp.c */
 void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..da1cbb88a6bf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char 
__user *optval,
 
lock_sock(sk);
sk->sk_bound_dev_if = index;
+   if (sk->sk_prot->rehash)
+   sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3386b3b0218c..0559a7f4c83a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net,
 {
int score;
struct inet_sock *inet;
+   bool dev_match;
 
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
 
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
+   dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+   dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..4813293d4fad 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
if (src_info->ipi6_ifindex) {
if (fl6->flowi6_oif &&
-   src_info->ipi6_ifindex != fl6->flowi6_oif)
+   src_info->ipi6_ifindex != fl6->flowi6_oif &&
+   (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+!sk_dev_equal_l3scope(
+sk, src_info->ipi6_ifindex)))
return -EINVAL;
fl6->flowi6_oif = src_info->

[PATCH net-next v3 5/9] net: fix raw socket lookup device bind matching with VRFs

2018-10-04 Thread Mike Manning

From: Duncan Eastoe 

When there exists a pair of raw sockets, one unbound and one bound
to a VRF but equal in all other respects, and a packet is received
in the VRF context, __raw_v4_lookup() matches on both sockets.

This results in the packet being delivered over both sockets,
instead of only the raw socket bound to the VRF. The bound device
checks in __raw_v4_lookup() are replaced with a call to
raw_sk_bound_dev_eq() which correctly handles whether the packet
should be delivered over the unbound socket in such cases.

In __raw_v6_lookup() the match on the device binding of the socket is
similarly updated to use raw_sk_bound_dev_eq() which matches the
handling in __raw_v4_lookup().

Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl
into account.

Signed-off-by: Duncan Eastoe 
Signed-off-by: Mike Manning 
---
 include/net/raw.h | 12 
 net/ipv4/raw.c|  3 +--
 net/ipv6/raw.c|  5 ++---
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/net/raw.h b/include/net/raw.h
index 9c9fa98a91a4..ce88fdd68933 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -18,6 +18,7 @@
 #define _RAW_H
 
 
+#include 
 #include 
 #include 
 
@@ -74,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk)
return (struct raw_sock *)sk;
 }
 
+static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 #endif /* _RAW_H */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..61f3559407a6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -131,8 +131,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock 
*sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num  &&
!(inet->inet_daddr && inet->inet_daddr != raddr)&&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-   !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+   raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 413d98bf24f4..86978784fbb5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -86,9 +86,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr))
continue;
 
-   if (sk->sk_bound_dev_if &&
-   sk->sk_bound_dev_if != dif &&
-   sk->sk_bound_dev_if != sdif)
+   if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif))
continue;
 
if (!ipv6_addr_any(>sk_v6_rcv_saddr)) {
-- 
2.11.0

[PATCH net-next v3 9/9] ipv6: do not drop vrf udp multicast packets

2018-10-04 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/udp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6722490c87b9..821fdc31dbc0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(>sk_v6_daddr) &&
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(>sk_v6_rcv_saddr) &&
!ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, >head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next v3 1/9] net: allow binding socket in a VRF when there's an unbound socket

2018-10-04 Thread Mike Manning

From: Robert Shearman 

Change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

Change the socket binding to take the l3mdev into account to allow an
unbound socket to not conflict with sockets bound to an l3mdev, given
the datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 13 ++---
 include/net/inet_sock.h  | 13 +
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 20 +++-
 6 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..4ae060b4bac2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
-   const unsigned short snum);
+   const unsigned short snum, int l3mdev);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
  struct inet_bind_bucket *tb);
 
@@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net 
*net,
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, 
__sdif) \
(((__sk)->sk_portpair == (__ports)) &&  \
 ((__sk)->sk_addrpair == (__cookie))&&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \

[PATCH net-next v3 8/9] ipv6: handling of multicast packets received in VRF

2018-10-04 Thread Mike Manning

If skbs for multicast packets marked as enslaved to a VRF are
received, then the secondary device index should be used to obtain
the real device. Also verify the multicast address against the
enslaved device rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..df58e1100226 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+   int sdif = inet6_sdif(skb);
+   struct net_device *dev;
 
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   dev = dev_get_by_index_rcu(net, sdif);
+   if (!dev)
+   goto discard;
+   } else {
+   dev = skb->dev;
+   }
+
if (ipv6_addr_is_multicast(>daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, >daddr,
-   >saddr) &&
+   !ipv6_chk_mcast_addr(dev, >daddr,
+>saddr) &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
@@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
-- 
2.11.0

[PATCH net-next v3 0/9] vrf: allow simultaneous service instances in default and other VRFs

2018-10-04 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation subject to the existing kernel
parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is
documented as allowing a single service instance to work across all
VRF domains. The functionality applies to UDP & TCP services, for IPv4
and IPv6, in particular adding VRF table handling for IPv6 multicast.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
LISTEN  0128   0.0.0.0%vrf-blue:ssh 0.0.0.0:*
LISTEN  01280.0.0.0:ssh 0.0.0.0:*
ESTAB   00  192.168.122.220:ssh   192.168.122.1:50282
LISTEN  0128  [::]%vrf-blue:ssh[::]:*
LISTEN  0128   [::]:ssh[::]:*
ESTAB   00   [3000::2]%vrf-blue:ssh   [3000::9]:45896
ESTAB   00[2000::2]:ssh   [2000::9]:46398

v1:
   - Address Paolo Abeni's comments (patch 4/5)
   - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5)
v2:
   - Address David Ahern's comments (patches 4/5 and 5/5)
   - Remove patches 3/5 and 5/5 from series for individual submissions
   - Include a sysctl for raw sockets as recommended by David Ahern
   - Expand series into 10 patches and provide improved descriptions
v3:
   - Update description for patch 1/10 and remove patch 6/10

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Duncan Eastoe (1):
  net: fix raw socket lookup device bind matching with VRFs

Mike Manning (6):
  net: ensure unbound stream socket to be chosen when not in a VRF
  net: ensure unbound datagram socket to be chosen when not in a VRF
  net: provide a sysctl raw_l3mdev_accept for raw socket lookup with
VRFs
  vrf: mark skb for multicast or link-local as enslaved to VRF
  ipv6: allow ping to link-local address in VRF
  ipv6: handling of multicast packets received in VRF

Robert Shearman (1):
  net: allow binding socket in a VRF when there's an unbound socket

 Documentation/networking/ip-sysctl.txt |  9 +
 Documentation/networking/vrf.txt   | 17 ++---
 drivers/net/vrf.c  | 19 +-
 include/net/inet6_hashtables.h |  5 ++---
 include/net/inet_hashtables.h  | 24 ---
 include/net/inet_sock.h| 21 
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  | 12 
 include/net/udp.h  | 11 +++
 net/core/sock.c|  2 ++
 net/ipv4/inet_connection_sock.c| 13 ++---
 net/ipv4/inet_hashtables.c | 34 -
 net/ipv4/raw.c |  3 +--
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 net/ipv4/udp.c | 15 ++-
 net/ipv6/datagram.c|  5 -
 net/ipv6/inet6_hashtables.c| 14 ++
 net/ipv6/ip6_input.c   | 35 +++---
 net/ipv6/ipv6_sockglue.c   |  2 +-
 net/ipv6/raw.c |  5 ++---
 net/ipv6/udp.c | 22 ++---
 21 files changed, 200 insertions(+), 82 deletions(-)

-- 
2.11.0

[PATCH net-next v3 7/9] ipv6: allow ping to link-local address in VRF

2018-10-04 Thread Mike Manning

If link-local packets are marked as enslaved to a VRF, then to allow
ping to the link-local address from a vrf, the error handling for
IPV6_PKTINFO needs to be relaxed to also allow the pkt ipi6_ifindex to
be that of a slave device to the vrf.

Note that the real device also needs to be retrieved in icmp6_iif()
to set the ipv6 flow oif to this for icmp echo reply handling. The
recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address
with VRF") takes care of this, so the sdif does not need checking here.

This fix makes ping to link-local consistent with that to global
addresses, in that this can now be done from within the same VRF that
the address is in.

Signed-off-by: Mike Manning 
---
 net/ipv6/ipv6_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9cc3a28..f3e99e578843 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next v3 6/9] vrf: mark skb for multicast or link-local as enslaved to VRF

2018-10-04 Thread Mike Manning

The skb for packets that are multicast or to a link-local address are
not marked as being enslaved to a VRF, if they are received on a socket
bound to the VRF. This is needed for ND and it is preferable for the
kernel not to have to deal with the additional use-cases if ll or mcast
packets are handled as enslaved. However, this does not allow service
instances listening on unbound and bound to VRF sockets to distinguish
the VRF used, if packets are sent as multicast or to a link-local
address. The fix is for the VRF driver to also mark these skb as being
enslaved to the VRF.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 69b7227c637e..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
-- 
2.11.0

Re: [PATCH net] ipv6: revert degradation in IPv6 Ready Logo test results

2018-10-03 Thread Mike Manning

On 02/10/2018 19:26, David Miller wrote:
> From: Mike Manning 
> Date: Tue,  2 Oct 2018 12:40:30 +0100
>
>> This reverts commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags
>> smaller than min mtu"). While one should not get fragments smaller than
>> the IPv6 minimum MTU, not handling crafted packets in the TAHI IPv6
>> conformance test suite (v6eval) for IPv6 Ready Logo results in 18
>> failures representing over 5% of the score.
>>
>> Cc: Florian Westphal 
>> Signed-off-by: Mike Manning 
> Sorry, I'm not just going to blindly apply a patch because some
> TAHI tests fail.
>
> It's possible the TAHI tests are wrong, or that the specification
> elements it is testing don't make any sense these days.
>
> Allowing all kinds of random junk in the middle of the fragment queue
> leads to lots of unnecessary cpu overhead and potential bugs, and it is
> triggerable remotely.

Understood, thank you.

It would be great if there is someone on this mailer who has influence
with ipv6ready.org so as to get the TAHI tests for IPv6 conformance
updated, as an upgrade to a kernel with the commit mentioned will result
in a 5% degradation in results for the existing tests.

[PATCH net] ipv6: revert degradation in IPv6 Ready Logo test results

2018-10-02 Thread Mike Manning

This reverts commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags
smaller than min mtu"). While one should not get fragments smaller than
the IPv6 minimum MTU, not handling crafted packets in the TAHI IPv6
conformance test suite (v6eval) for IPv6 Ready Logo results in 18
failures representing over 5% of the score.

Cc: Florian Westphal 
Signed-off-by: Mike Manning 
---
The failures which are reverted by this fix are:

Section 1: RFC 2460 - IPv6 Specification
Test v6LC.1.2.4: Extension Header Processing Order
33-34
Test v6LC.1.3.1: Fragment Reassembly
67-72
Test v6LC.1.3.2: Reassembly Time Exceeded
73-76
Test v6LC.1.3.3: Fragment Header M-Bit Set, Payload Length Invalid
78

Section 5: RFC 4443 - ICMPv6
Test v6LC.5.1.6: Erroneous Header Field (Parameter Problem Generation)
20 Erroneous Header Field (Parameter Problem Generation)
Test v6LC.5.1.10: Error Condition With Multicast Destination
31 Part B: Echo Request Reassembly Timeout
Test v6LC.5.1.11: Error Condition With Non-Unique Source - Unspecified
35 Part C: Echo Request Reassembly Timeout (Routers and Hosts)
Test v6LC.5.1.12: Error Condition With Non-Unique Source - Multicast
40 Part C: Echo Request Reassembly Timeout (Routers and Hosts)
Test v6LC.5.1.13: Error Condition With Non-Unique Source Anycast (Routers Only)
45 Part C: Echo Request Reassembly Timeout

 net/ipv6/netfilter/nf_conntrack_reasm.c | 4 
 net/ipv6/reassembly.c   | 4 
 2 files changed, 8 deletions(-)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 8f68a518d9db..8c69c4fc78d8 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -559,10 +559,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff 
*skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
-   if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
-   fhdr->frag_off & htons(IP6_MF))
-   return -EINVAL;
-
skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, hdr,
 skb->dev ? skb->dev->ifindex : 0);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c5b4f79296e..b4e558ab39fa 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -456,10 +456,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1;
}
 
-   if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
-   fhdr->frag_off & htons(IP6_MF))
-   goto fail_hdr;
-
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
-- 
2.11.0

Re: [PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket

2018-10-01 Thread Mike Manning

On 25/09/2018 18:16, David Ahern wrote:
> On 9/25/18 9:26 AM, Mike Manning wrote:
>> On 24/09/2018 23:44, David Ahern wrote:
>>> On 9/24/18 10:13 AM, Mike Manning wrote:
>>>> From: Robert Shearman 
>>>>
>>>> There is no easy way currently for applications that want to receive
>>>> packets in the default VRF to be isolated from packets arriving in
>>>> VRFs, which makes using VRF-unaware applications in a VRF-aware system
>>>> a potential security risk.
>>> That comment is not correct.
>>>
>>> The point of the l3mdev sysctl's is to prohibit this case. Setting
>>> net.ipv4.{tcp,udp}_l3mdev_accept=0 means that a packet arriving on an
>>> interface enslaved to a VRF can not be received by a global socket.
>> Hi David, thanks for reviewing this. The converse does not hold though,
>> i.e. there is no guarantee that the unbound socket will be selected for
>> packets when not in a VRF, if there is an unbound socket and a socket
>> bound to a VRF. Also, such packets should not be handled by the socket
> I need an explicit example here. You are saying a packet arriving on an
> interface not enslaved to a VRF might match a socket bound to a VRF?
This problem occurs when different service instances are listening on an
unbound socket and sockets bound to VRFs respectively. Received packets
that are not in a VRF are not guaranteed to be handled by the unbound
socket.
>> in the VRF if there is no unbound socket. We also had an issue with raw
>> socket lookup device bind matching. I can break this particular patch
>> into smaller patches and provide more detail, would this help? I will
>> also update/break up the other patches according to your comments.
> Why not add an l3mdev sysctl for raw sockets then?
I have now added this, see patch 4/10.
> Yes, please send smaller patches. A diff stat of:
> 15 files changed, 109 insertions(+), 62 deletions(-)
> is a bit harsh.
I have removed the 2 patches you are ok with and have submitted them
separately, and have split the remaining 3 into 10 smaller patches.
>>> Setting the l3mdev to 1 allows the default socket to work across VRFs.
>>> If that is not what you want for a given app or a given VRF, then one
>>> option is to add netfilter rules on the VRF device to prohibit it. I
>>> just verified this works for both tcp and udp.
>> Netfilter is per application and so does not scale. I have not checked
>> if it is suitable for packet handling on raw sockets.
>>
>>> Further, overlapping binds are allowed using SO_REUSEPORT meaning I can
>>> have a server running in the default vrf bound to a port AND a server
>>> running bound to a specific vrf and the same port:
>>>
>>> udp    UNCONN 0  0  *%red:12345 *:*
>>>     users:(("vrf-test",pid=1376,fd=3))
>>> udp    UNCONN 0  0   *:12345 *:*
>>>  users:(("vrf-test",pid=1375,fd=3))
>>>
>>> tcp    LISTEN 0  1  *%red:12345 *:*
>>>     users:(("vrf-test",pid=1356,fd=3))
>>> tcp    LISTEN 0  1   *:12345 *:*
>>>  users:(("vrf-test",pid=1352,fd=3))
>>>
>>> For packets arriving on an interface enslaved to a VRF the socket lookup
>>> will pick the VRF server over the global one.
>> Agreed, but the converse is not guaranteed to hold i.e. packets that are
>> not in a VRF may be handled by a socket bound to a VRF.
>>
>> We do use SO_REUSEPORT for our own applications so as to run instances
>> in the default and other VRFs, but still require these patches (or
>> similar) due to how packets are handled when there is an unbound socket
>> and sockets bound to different VRFs.
> Why can't compute_score be adjusted to account for that case?
Yes, this is what we are doing. This is now in patch 2/10 for stream and
3/10 for datagram sockets, and see 5/10 for raw sockets.
>>> -- 
>>>
>>> With this patch set I am seeing a number of tests failing -- socket
>>> connections working when they should not or not working when they
>>> should. I only skimmed the results. I am guessing this patch is the
>>> reason, but that is just a guess.
>>>
>>> You need to make sure all permutations of:
>>> 1. net.ipv4.{tcp,udp}_l3mdev_accept={0,1},
>>> 2. connection in the default VRF and in a VRF,
>>> 3. locally originated and remote traffic,
>>> 4. ipv4 and ipv6
>>>
>> We are using raw, datagram and stream sockets for ipv4 & ipv6

[PATCH net-next v2 03/10] net: ensure unbound datagram socket to be chosen when not in a VRF

2018-10-01 Thread Mike Manning

Ensure an unbound datagram skt is chosen when not in a VRF. The check
for a device match in compute_score() for UDP must also be performed
when there is no bound device. For this, a failure is returned when
there is no device match. This ensures that bound sockets are never
selected, even if there is no unbound socket.

Allow IPv6 packets to be sent over a datagram skt bound to a VRF. These
packets are currently blocked, as flowi6_oif was set to that of the
master vrf device, and the ipi6_ifindex is that of the slave device.
Allow these packets to be sent by checking the device with ipi6_ifindex
has the same L3 scope as that of the bound device of the skt, which is
the master vrf device. Note that this check always succeeds if the skt
is unbound.

Even though the right datagram skt is now selected by compute_score(),
a different skt is being returned that is bound to the wrong vrf. The
difference between these and stream sockets is the handling of the skt
option for SO_REUSEPORT. While the handling when adding a skt for reuse
correctly checks that the bound device of the skt is a match, the skts
in the hashslot are already incorrect. So for the same hash, a skt for
the wrong vrf may be selected for the required port. The root cause is
that the skt is immediately placed into a slot when it is created,
but when the skt is then bound using SO_BINDTODEVICE, it remains in the
same slot. The solution is to move the skt to the correct slot by
forcing a rehash.

Signed-off-by: Mike Manning 
---
 include/net/udp.h   | 11 +++
 net/core/sock.c |  2 ++
 net/ipv4/udp.c  | 15 ++-
 net/ipv6/datagram.c |  5 -
 net/ipv6/udp.c  | 14 +-
 5 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..1e4fb8feaf50 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -252,6 +252,17 @@ static inline int udp_rqueue_get(struct sock *sk)
return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit);
 }
 
+static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_udp_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 /* net/ipv4/udp.c */
 void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..da1cbb88a6bf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -567,6 +567,8 @@ static int sock_setbindtodevice(struct sock *sk, char 
__user *optval,
 
lock_sock(sk);
sk->sk_bound_dev_if = index;
+   if (sk->sk_prot->rehash)
+   sk->sk_prot->rehash(sk);
sk_dst_reset(sk);
release_sock(sk);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3386b3b0218c..0559a7f4c83a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -371,6 +371,7 @@ static int compute_score(struct sock *sk, struct net *net,
 {
int score;
struct inet_sock *inet;
+   bool dev_match;
 
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -398,15 +399,11 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
 
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
+   dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+   dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..4813293d4fad 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -782,7 +782,10 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
if (src_info->ipi6_ifindex) {
if (fl6->flowi6_oif &&
-   src_info->ipi6_ifindex != fl6->flowi6_oif)
+   src_info->ipi6_ifindex != fl6->flowi6_oif &&
+   (sk->sk_bound_dev_if != fl6->flowi6_oif ||
+!sk_dev_equal_l3scope(
+sk, src_info->ipi6_ifindex)))
return -EINVAL;
fl6->flowi6_oif = src_info->

[PATCH net-next v2 01/10] net: allow binding socket in a VRF when there's an unbound socket

2018-10-01 Thread Mike Manning

From: Robert Shearman 

There is no easy way currently for applications that want to receive
packets in the default VRF to be isolated from packets arriving in
VRFs, which makes using VRF-unaware applications in a VRF-aware system
a potential security risk.

So change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

Change the socket binding to take the l3mdev into account to allow an
unbound socket to not conflict with sockets bound to an l3mdev given
the datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 13 ++---
 include/net/inet_sock.h  | 13 +
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 20 +++-
 6 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..4ae060b4bac2 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -191,7 +192,7 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
-   const unsigned short snum);
+   const unsigned short snum, int l3mdev);
 void inet_bind_bucket_destroy(struct kmem_cache *cachep,
  struct inet_bind_bucket *tb);
 
@@ -282,9 +283,8 @@ static inline struct sock *inet_lookup_listener(struct net 
*net,
 #define INET_MATCH(__sk, __net, __cookie, __saddr, __daddr, __ports, __dif, 
__sdif) \
(((__sk)->sk_portpair == (__ports)) &&  \
 ((__sk)->sk_addrpair == (__cookie))&&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-

[PATCH net-next v2 08/10] ipv6: allow ping to link-local address in VRF

2018-10-01 Thread Mike Manning

If link-local packets are marked as enslaved to a VRF, then to allow
ping to the link-local from a vrf, the error handling for IPV6_PKTINFO
needs to be relaxed to also allow the pkt ipi6_ifindex to be that of a
slave device to the vrf.

Note that the real device also needs to be retrieved in icmp6_iif()
to set the ipv6 flow oif to this for icmp echo reply handling. The
recent commit 24b711edfc34 ("net/ipv6: Fix linklocal to global address
with VRF") takes care of this, so the sdif does not need checking here.

This fix makes ping to link-local consistent with that to global
addresses, in that this can now be done from within the same VRF that
the address is in.

Signed-off-by: Mike Manning 
---
 net/ipv6/ipv6_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 7dfbc797b130..4ebd395dd3df 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next v2 07/10] vrf: mark skb for multicast or link-local as enslaved to VRF

2018-10-01 Thread Mike Manning

The skb for packets that are multicast or to a link-local address are
not marked as being enslaved to a VRF, if they are received on a socket
bound to the VRF. This is needed for ND and it is preferable for the
kernel not to have to deal with the additional use-cases if ll or mcast
packets are handled as enslaved. However, this does not allow service
instances listening on unbound and bound to VRF sockets to distinguish
the VRF used, if packets are sent as multicast or to a link-local
address. The fix is for the VRF driver to also mark these skb as being
enslaved to the VRF.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 69b7227c637e..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
-- 
2.11.0

[PATCH net-next v2 04/10] net: provide a sysctl raw_l3mdev_accept for raw socket lookup with VRFs

2018-10-01 Thread Mike Manning

Add a sysctl raw_l3mdev_accept to control raw socket lookup in a manner
similar to use of tcp_l3mdev_accept for stream and of udp_l3mdev_accept
for datagram sockets. Have this default to off as this is what users
expect, given that there is no explicit mechanism to set unmodified
VRF-unaware applications into a default VRF.

Signed-off-by: Mike Manning 
---
 Documentation/networking/ip-sysctl.txt |  9 +
 Documentation/networking/vrf.txt   |  8 +---
 include/net/netns/ipv4.h   |  3 +++
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 4 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 8313a636dd53..a46be4a5b7a0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -788,6 +788,15 @@ udp_wmem_min - INTEGER
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
Default: 4K
 
+RAW variables:
+
+raw_l3mdev_accept - BOOLEAN
+   Enabling this option allows a "global" bound socket to work
+   across L3 master domains (e.g., VRFs) with packets capable of
+   being received regardless of the L3 domain in which they
+   originated. Only valid when the kernel was compiled with
+   CONFIG_NET_L3_MASTER_DEV.
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN
diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index d4b129402d57..deb798342f1e 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -108,11 +108,13 @@ limited to the default VRF. That is, it will not be 
matched by packets
 arriving on interfaces enslaved to an l3mdev and processes may bind to
 the same port if they bind to an l3mdev.
 
-TCP & UDP services running in the default VRF context (ie., not bound
-to any VRF device) can work across all VRF domains by enabling the
-tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
+TCP & UDP services & services using RAW sockets that are running in the
+default VRF context (ie., not bound to any VRF device) can work across
+all VRF domains by enabling the tcp_l3mdev_accept, udp_l3mdev_accept and
+raw_l3mdev_accept sysctl options:
 sysctl -w net.ipv4.tcp_l3mdev_accept=1
 sysctl -w net.ipv4.udp_l3mdev_accept=1
+sysctl -w net.ipv4.raw_l3mdev_accept=1
 
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index e47503b4e4d1..104a6669e344 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -103,6 +103,9 @@ struct netns_ipv4 {
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   int sysctl_raw_l3mdev_accept;
+#endif
int sysctl_tcp_early_demux;
int sysctl_udp_early_demux;
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b92f422f2fa8..d173337040ee 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -601,6 +601,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode   = 0644,
.proc_handler   = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+   {
+   .procname   = "raw_l3mdev_accept",
+   .data   = _net.ipv4.sysctl_raw_l3mdev_accept,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = ,
+   .extra2 = ,
+   },
+#endif
{
.procname   = "tcp_ecn",
.data   = _net.ipv4.sysctl_tcp_ecn,
-- 
2.11.0

[PATCH net-next v2 10/10] ipv6: do not drop vrf udp multicast packets

2018-10-01 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/udp.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6722490c87b9..821fdc31dbc0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(>sk_v6_daddr) &&
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(>sk_v6_rcv_saddr) &&
!ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, >head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next v2 05/10] net: fix raw socket lookup device bind matching with VRFs

2018-10-01 Thread Mike Manning

From: Duncan Eastoe 

If there exists a pair of raw sockets, one unbound and one bound
to a VRF but equal in all other respects, then when a packet is
received in the VRF context, __raw_v4_lookup() matches on both sockets.

This results in the packet being delivered over both sockets,
instead of only the raw socket bound to the VRF. The bound device
checks in __raw_v4_lookup() are replaced with a call to
raw_sk_bound_dev_eq() which correctly handles whether the packet
should be delivered over the unbound socket in such cases.

In __raw_v6_lookup() the match on the device binding of the socket is
similarly updated to use raw_sk_bound_dev_eq() which matches the
handling in __raw_v4_lookup().

Importantly raw_sk_bound_dev_eq() takes the raw_l3mdev_accept sysctl
into account.

Signed-off-by: Duncan Eastoe 
Signed-off-by: Mike Manning 
---
 include/net/raw.h | 12 
 net/ipv4/raw.c|  4 ++--
 net/ipv6/raw.c|  6 +++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/net/raw.h b/include/net/raw.h
index 9c9fa98a91a4..ce88fdd68933 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -18,6 +18,7 @@
 #define _RAW_H
 
 
+#include 
 #include 
 #include 
 
@@ -74,4 +75,15 @@ static inline struct raw_sock *raw_sk(const struct sock *sk)
return (struct raw_sock *)sk;
 }
 
+static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+  int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_raw_l3mdev_accept,
+bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 #endif /* _RAW_H */
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8ca3eb06ba04..6d8006c86dc0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -70,6 +70,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -131,8 +132,7 @@ struct sock *__raw_v4_lookup(struct net *net, struct sock 
*sk,
if (net_eq(sock_net(sk), net) && inet->inet_num == num  &&
!(inet->inet_daddr && inet->inet_daddr != raddr)&&
!(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
-   !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+   raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
goto found; /* gotcha */
}
sk = NULL;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 413d98bf24f4..5b363d315b06 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #if IS_ENABLED(CONFIG_IPV6_MIP6)
 #include 
@@ -86,9 +87,8 @@ struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr))
continue;
 
-   if (sk->sk_bound_dev_if &&
-   sk->sk_bound_dev_if != dif &&
-   sk->sk_bound_dev_if != sdif)
+   if (!raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif))
continue;
 
if (!ipv6_addr_any(>sk_v6_rcv_saddr)) {
-- 
2.11.0

[PATCH net-next v2 06/10] net: IP[V6]_MULTICAST_IF constraint on unbound socket if VRFs present

2018-10-01 Thread Mike Manning

From: Duncan Eastoe 

If setsockopt(IP_MULTICAST_IF) or setsockopt(IPV6_MULTICAST_IF) is
called on a socket which is not bound to a VRF then we should ensure
that the output device chosen is also not bound to a VRF master.

This avoids inadvertently sending traffic out of the wrong interface.
This can be particularly problematic for IP_MULTICAST_IF since the
interface lookup can be performed by address as well as ifindex. If
there are interfaces with the same address, one unbound and one bound
to a VRF, then the interface bound to the VRF may be chosen when the
sockopt is called on an unbound socket.

Signed-off-by: Duncan Eastoe 
Signed-off-by: Mike Manning 
---
 net/ipv4/ip_sockglue.c   | 3 +++
 net/ipv6/ipv6_sockglue.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c0fe5ad996f2..026971314c43 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -892,6 +892,9 @@ static int do_ip_setsockopt(struct sock *sk, int level,
dev_put(dev);
 
err = -EINVAL;
+   if (!sk->sk_bound_dev_if && midx)
+   break;
+
if (sk->sk_bound_dev_if &&
mreq.imr_ifindex != sk->sk_bound_dev_if &&
(!midx || midx != sk->sk_bound_dev_if))
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9cc3a28..7dfbc797b130 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -626,6 +626,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
 
rcu_read_unlock();
 
+   if (!sk->sk_bound_dev_if && midx)
+   goto e_inval;
+
if (sk->sk_bound_dev_if &&
sk->sk_bound_dev_if != val &&
(!midx || midx != sk->sk_bound_dev_if))
-- 
2.11.0

[PATCH net-next v2 09/10] ipv6: handling of multicast packets received in VRF

2018-10-01 Thread Mike Manning

If skbs for multicast packets marked as enslaved to a VRF are
received, then the secondary device index should be used to obtain
the real device. The multicast address should then be verified against
the enslaved device rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 35 ---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..df58e1100226 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -359,6 +359,8 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+   int sdif = inet6_sdif(skb);
+   struct net_device *dev;
 
/* Only do this once for first final protocol */
have_final = true;
@@ -371,9 +373,19 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   dev = dev_get_by_index_rcu(net, sdif);
+   if (!dev)
+   goto discard;
+   } else {
+   dev = skb->dev;
+   }
+
if (ipv6_addr_is_multicast(>daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, >daddr,
-   >saddr) &&
+   !ipv6_chk_mcast_addr(dev, >daddr,
+>saddr) &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
@@ -432,15 +444,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
-- 
2.11.0

[PATCH net-next v2 00/10] vrf: allow simultaneous service instances in default and other VRFs

2018-10-01 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation subject to the existing kernel
parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is
documented as allowing a single service instance to work across all
VRF domains. The functionality applies to UDP & TCP services, for IPv4
and IPv6, in particular adding VRF table handling for IPv6 multicast.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
LISTEN  0128   0.0.0.0%vrf-blue:ssh 0.0.0.0:*
LISTEN  01280.0.0.0:ssh 0.0.0.0:*
ESTAB   00  192.168.122.220:ssh   192.168.122.1:50282
LISTEN  0128  [::]%vrf-blue:ssh[::]:*
LISTEN  0128   [::]:ssh[::]:*
ESTAB   00   [3000::2]%vrf-blue:ssh   [3000::9]:45896
ESTAB   00[2000::2]:ssh   [2000::9]:46398

v1:
   - Address Paolo Abeni's comments (patch 4/5)
   - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5)
v2:
   - Address David Ahern's comments (patches 4/5 and 5/5)
   - Remove patches 3/5 and 5/5 from series for individual submissions
   - Include a sysctl for raw sockets as recommended by David Ahern
   - Expand series into 10 patches and provide improved descriptions

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Duncan Eastoe (2):
  net: fix raw socket lookup device bind matching with VRFs
  net: IP[V6]_MULTICAST_IF constraint on unbound socket if VRFs present

Mike Manning (6):
  net: ensure unbound stream socket to be chosen when not in a VRF
  net: ensure unbound datagram socket to be chosen when not in a VRF
  net: provide a sysctl raw_l3mdev_accept for raw socket lookup with
VRFs
  vrf: mark skb for multicast or link-local as enslaved to VRF
  ipv6: allow ping to link-local address in VRF
  ipv6: handling of multicast packets received in VRF

Robert Shearman (1):
  net: allow binding socket in a VRF when there's an unbound socket

 Documentation/networking/ip-sysctl.txt |  9 +
 Documentation/networking/vrf.txt   | 17 ++---
 drivers/net/vrf.c  | 19 +-
 include/net/inet6_hashtables.h |  5 ++---
 include/net/inet_hashtables.h  | 24 ---
 include/net/inet_sock.h| 21 
 include/net/netns/ipv4.h   |  3 +++
 include/net/raw.h  | 12 
 include/net/udp.h  | 11 +++
 net/core/sock.c|  2 ++
 net/ipv4/inet_connection_sock.c| 13 ++---
 net/ipv4/inet_hashtables.c | 34 -
 net/ipv4/ip_sockglue.c |  3 +++
 net/ipv4/raw.c |  4 ++--
 net/ipv4/sysctl_net_ipv4.c | 11 +++
 net/ipv4/udp.c | 15 ++-
 net/ipv6/datagram.c|  5 -
 net/ipv6/inet6_hashtables.c| 14 ++
 net/ipv6/ip6_input.c   | 35 +++---
 net/ipv6/ipv6_sockglue.c   |  5 -
 net/ipv6/raw.c |  6 +++---
 net/ipv6/udp.c | 22 ++---
 22 files changed, 208 insertions(+), 82 deletions(-)

-- 
2.11.0

[PATCH net-next v2 02/10] net: ensure unbound stream socket to be chosen when not in a VRF

2018-10-01 Thread Mike Manning

The commit a04a480d4392 ("net: Require exact match for TCP socket
lookups if dif is l3mdev") only ensures that the correct socket is
selected for packets in a VRF. However, there is no guarantee that
the unbound socket will be selected for packets when not in a VRF.
By checking for a device match in compute_score() also for the case
when there is no bound device and attaching a score to this, the
unbound socket is selected. And if a failure is returned when there
is no device match, this ensures that bound sockets are never selected,
even if there is no unbound socket.

Signed-off-by: Mike Manning 
---
 include/net/inet_hashtables.h | 11 +++
 include/net/inet_sock.h   |  8 
 net/ipv4/inet_hashtables.c| 14 ++
 net/ipv6/inet6_hashtables.c   | 14 ++
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4ae060b4bac2..5de2d9f24c05 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -189,6 +189,17 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+   int dif, int sdif)
+{
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+   return inet_bound_dev_eq(net->ipv4.sysctl_tcp_l3mdev_accept,
+   bound_dev_if, dif, sdif);
+#else
+   return inet_bound_dev_eq(1, bound_dev_if, dif, sdif);
+#endif
+}
+
 struct inet_bind_bucket *
 inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 92e0aa3958f6..47c03ea989ad 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -143,6 +143,14 @@ static inline int inet_sk_bound_l3mdev(const struct sock 
*sk)
return 0;
 }
 
+static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
+int dif, int sdif)
+{
+   if (!bound_dev_if)
+   return !sdif || l3mdev_accept;
+   return bound_dev_if == dif || bound_dev_if == sdif;
+}
+
 static inline struct ip_options_rcu *ireq_opt_deref(const struct 
inet_request_sock *ireq)
 {
return rcu_dereference_check(ireq->ireq_opt,
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 260531dc6458..2ec684057ebd 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -235,6 +235,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
 {
int score = -1;
struct inet_sock *inet = inet_sk(sk);
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
@@ -245,15 +246,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score += 4;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+   return -1;
+   score += 4;
 
-   if (!dev_match)
-   return -1;
-   if (sk->sk_bound_dev_if)
-   score += 4;
-   }
if (sk->sk_incoming_cpu == raw_smp_processor_id())
score++;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5eeeba7181a1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -99,6 +99,7 @@ static inline int compute_score(struct sock *sk, struct net 
*net,
const int dif, const int sdif, bool exact_dif)
 {
int score = -1;
+   bool dev_match;
 
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
@@ -109,15 +110,12 @@ static inline int compute_score(struct sock *sk, struct 
net *net,
return -1;
score++;
}
-   if (sk->sk_bound_dev_if || exact_dif) {
-   bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
+   dev_match = inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+dif, sdif);
+   if (!dev_match)
+

[PATCH net-next] ipv6: add vrf table handling code for ipv6 mcast

2018-10-01 Thread Mike Manning

From: Patrick Ruddy 

The code to obtain the correct table for the incoming interface was
missing for IPv6. This has been added along with the table creation
notification to fib rules for the RTNL_FAMILY_IP6MR address family.

Signed-off-by: Patrick Ruddy 
Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 11 +++
 net/ipv6/ip6mr.c  | 48 
 2 files changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index f93547f257fb..69b7227c637e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1215,8 +1215,19 @@ static int vrf_add_fib_rules(const struct net_device 
*dev)
goto ipmr_err;
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+   err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
+   if (err < 0)
+   goto ip6mr_err;
+#endif
+
return 0;
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+ip6mr_err:
+   vrf_fib_rule(dev, RTNL_FAMILY_IPMR,  false);
+#endif
+
 #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
 ipmr_err:
vrf_fib_rule(dev, AF_INET6,  false);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..6f07b8380425 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 
id);
 static void ip6mr_free_table(struct mr_table *mrt);
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *cache);
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *cache);
 static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert);
 static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
@@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 
*flp6,
.flags = FIB_LOOKUP_NOREF,
};
 
+   /* update flow if oif or iif point to device enslaved to l3mdev */
+   l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
   flowi6_to_flowi(flp6), 0, );
if (err < 0)
@@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct 
flowi *flp,
return -EINVAL;
}
 
-   mrt = ip6mr_get_table(rule->fr_net, rule->table);
+   arg->table = fib_rule_get_table(rule, arg);
+
+   mrt = ip6mr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
-   ip6_mr_forward(net, mrt, skb, c);
+   ip6_mr_forward(net, mrt, skb->dev, skb, c);
}
 }
 
@@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, 
struct sk_buff *pkt,
 
 /* Queue a packet for resolution. It gets locked cache entry! */
 static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
 {
struct mfc6_cache *c;
bool found = false;
@@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, 
mifi_t mifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+   if (dev) {
+   skb->dev = dev;
+   skb->skb_iif = dev->ifindex;
+   }
skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct 
net_device *dev)
 }
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *c)
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *c)
 {
int psend = -1;
int vif, ct;
-   int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+   int true_vifi = ip6mr_find_vif(mrt, dev);
 
vif = c->_c.mfc_parent;
c->_c.mfc_un.res.pkt++;
@@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct 
mr_table *mrt,
/*
 * Wrong interface: drop packet and (maybe) send PIM assert.
 */
-   if (mrt->vif_table[vif].dev != skb->dev) {
+   if (mrt->vif_table[vif].dev != dev) {
c->_c.mfc_un.res.wrong_if++;
 
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2154,6 +2165,19 @@ int ip6_mr_input(struct sk_buff *skb)

[PATCH net-next] ipv4: Allow sending multicast packets on specific i/f using VRF socket

2018-10-01 Thread Mike Manning

From: Robert Shearman 

It is useful to be able to use the same socket both for listening in a
specific VRF and for sending multicast packets out of a specific
interface. However, the bound device on the socket currently takes
precedence and results in the packets not being sent.

Relax the condition on overriding the output interface to use for
sending packets out of UDP, raw and ping sockets to allow multicast
packets to be sent using the specified multicast interface.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 net/ipv4/datagram.c | 2 +-
 net/ipv4/ping.c | 2 +-
 net/ipv4/raw.c  | 2 +-
 net/ipv4/udp.c  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr 
*uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
-   if (!oif)
+   if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 33df4d76db2d..8ca3eb06ba04 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
tos |= RTO_ONLINK;
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f4e35b2ff8b8..3386b3b0218c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1042,7 +1042,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
-- 
2.11.0

Re: [PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket

2018-09-25 Thread Mike Manning


On 24/09/2018 23:44, David Ahern wrote:

On 9/24/18 10:13 AM, Mike Manning wrote:

From: Robert Shearman 

There is no easy way currently for applications that want to receive
packets in the default VRF to be isolated from packets arriving in
VRFs, which makes using VRF-unaware applications in a VRF-aware system
a potential security risk.


That comment is not correct.

The point of the l3mdev sysctl's is to prohibit this case. Setting
net.ipv4.{tcp,udp}_l3mdev_accept=0 means that a packet arriving on an
interface enslaved to a VRF can not be received by a global socket.
Hi David, thanks for reviewing this. The converse does not hold though, 
i.e. there is no guarantee that the unbound socket will be selected for 
packets when not in a VRF, if there is an unbound socket and a socket 
bound to a VRF. Also, such packets should not be handled by the socket 
in the VRF if there is no unbound socket. We also had an issue with raw 
socket lookup device bind matching. I can break this particular patch 
into smaller patches and provide more detail, would this help? I will 
also update/break up the other patches according to your comments.




Setting the l3mdev to 1 allows the default socket to work across VRFs.
If that is not what you want for a given app or a given VRF, then one
option is to add netfilter rules on the VRF device to prohibit it. I
just verified this works for both tcp and udp.


Netfilter is per application and so does not scale. I have not checked 
if it is suitable for packet handling on raw sockets.




Further, overlapping binds are allowed using SO_REUSEPORT meaning I can
have a server running in the default vrf bound to a port AND a server
running bound to a specific vrf and the same port:

udpUNCONN 0  0  *%red:12345 *:*
users:(("vrf-test",pid=1376,fd=3))
udpUNCONN 0  0   *:12345 *:*
 users:(("vrf-test",pid=1375,fd=3))

tcpLISTEN 0  1  *%red:12345 *:*
users:(("vrf-test",pid=1356,fd=3))
tcpLISTEN 0  1   *:12345 *:*
 users:(("vrf-test",pid=1352,fd=3))

For packets arriving on an interface enslaved to a VRF the socket lookup
will pick the VRF server over the global one.


Agreed, but the converse is not guaranteed to hold i.e. packets that are 
not in a VRF may be handled by a socket bound to a VRF.


We do use SO_REUSEPORT for our own applications so as to run instances 
in the default and other VRFs, but still require these patches (or 
similar) due to how packets are handled when there is an unbound socket 
and sockets bound to different VRFs.




--

With this patch set I am seeing a number of tests failing -- socket
connections working when they should not or not working when they
should. I only skimmed the results. I am guessing this patch is the
reason, but that is just a guess.

You need to make sure all permutations of:
1. net.ipv4.{tcp,udp}_l3mdev_accept={0,1},
2. connection in the default VRF and in a VRF,
3. locally originated and remote traffic,
4. ipv4 and ipv6



We are using raw, datagram and stream sockets for ipv4 & ipv6, require
connectivity for local and remote addresses where appropriate and need
route leaking between VRFs when configured; we are unaware of any
outstanding bugs. Is there some way that I can run/analyze the tests
that are failing for you?


Also, cf. patch 2/5: note that ping to link-local addresses is handled
consistently with that to global addresses in a VRF, so this now
succeeds if ping is done in the VRF, i.e.
'sudo ip vrf exec <vrf> ping <link-local-addr> -I <interface>'.



continue to work as expected meaning packets flow when they should and
fail with the right error when they should not. I believe the UDP cases
were the main ones failing.

Given the test failures, I did not look at the code changes in the patch.

[PATCH net-next v1 1/5] net: allow binding socket in a VRF when there's an unbound socket

2018-09-24 Thread Mike Manning

From: Robert Shearman 

There is no easy way currently for applications that want to receive
packets in the default VRF to be isolated from packets arriving in
VRFs, which makes using VRF-unaware applications in a VRF-aware system
a potential security risk.

So change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

The existing net.ipv4.tcp_l3mdev_accept & net.ipv4.udp_l3mdev_accept
sysctls, which are documented as allowing a service to work across all
VRF domains, can be used to also work in the default VRF by causing
unbound sockets to match against packets arriving on a device
enslaved to an l3mdev.

Change the socket binding to take the l3mdev into account so that an
unbound socket does not conflict with sockets bound to an l3mdev, given
the datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 31 ---
 include/net/inet_sock.h  | 13 +
 net/core/sock.c  |  2 ++
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 34 +-
 net/ipv4/ip_sockglue.c   |  3 +++
 net/ipv4/raw.c   |  4 ++--
 net/ipv4/udp.c   | 15 ++-
 net/ipv6/datagram.c  |  5 -
 net/ipv6/inet6_hashtables.c  | 14 ++
 net/ipv6/ipv6_sockglue.c |  3 +++
 net/ipv6/raw.c   |  6 +++---
 net/ipv6/udp.c   | 14 +-
 15 files changed, 109 insertions(+), 62 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..866efd35ded4 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -188,10 +189,28 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static inline bool inet_sk_bound_dev_eq(struct ne

[PATCH net-next v1 0/5] vrf: allow simultaneous service instances in default and other VRFs

2018-09-24 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation subject to the existing kernel
parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is
documented as allowing a single service instance to work across all
VRF domains. The functionality applies to UDP & TCP services, for IPv4
and IPv6, in particular adding VRF table handling for IPv6 multicast.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q   Send-Q   Local Address:Port   Peer Address:Port
LISTEN  0128   0.0.0.0%vrf-blue:ssh 0.0.0.0:*
LISTEN  01280.0.0.0:ssh 0.0.0.0:*
ESTAB   00  192.168.122.220:ssh   192.168.122.1:50282
LISTEN  0128  [::]%vrf-blue:ssh[::]:*
LISTEN  0128   [::]:ssh[::]:*
ESTAB   00   [3000::2]%vrf-blue:ssh   [3000::9]:45896
ESTAB   00[2000::2]:ssh   [2000::9]:46398

v1:
   - Address Paolo Abeni's comments (patch 4/5)
   - Fix build when CONFIG_NET_L3_MASTER_DEV not defined (patch 1/5)

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Mike Manning (1):
  ipv6: allow link-local and multicast packets inside vrf

Patrick Ruddy (1):
  ipv6: add vrf table handling code for ipv6 mcast

Robert Shearman (2):
  net: allow binding socket in a VRF when there's an unbound socket
  ipv4: Allow sending multicast packets on specific i/f using VRF socket

 Documentation/networking/vrf.txt |  9 
 drivers/net/vrf.c| 30 
 include/net/inet6_hashtables.h   |  5 ++--
 include/net/inet_hashtables.h| 31 +++--
 include/net/inet_sock.h  | 13 +++
 net/core/sock.c  |  2 ++
 net/ipv4/datagram.c  |  2 +-
 net/ipv4/inet_connection_sock.c  | 13 ---
 net/ipv4/inet_hashtables.c   | 34 +---
 net/ipv4/ip_sockglue.c   |  3 +++
 net/ipv4/ping.c  |  2 +-
 net/ipv4/raw.c   |  6 ++---
 net/ipv4/udp.c   | 17 ++
 net/ipv6/datagram.c  |  5 +++-
 net/ipv6/inet6_hashtables.c  | 14 +---
 net/ipv6/ip6_input.c | 43 +++
 net/ipv6/ip6mr.c | 49 ++--
 net/ipv6/ipv6_sockglue.c |  5 +++-
 net/ipv6/raw.c   |  6 ++---
 net/ipv6/udp.c   | 22 --
 20 files changed, 214 insertions(+), 97 deletions(-)
-- 
2.11.0

[PATCH net-next v1 3/5] ipv4: Allow sending multicast packets on specific i/f using VRF socket

2018-09-24 Thread Mike Manning

From: Robert Shearman 

It is useful to be able to use the same socket both for listening in a
specific VRF and for sending multicast packets out of a specific
interface. However, the bound device on the socket currently takes
precedence and results in the packets not being sent.

Relax the condition on overriding the output interface to use for
sending packets out of UDP, raw and ping sockets to allow multicast
packets to be sent using the specified multicast interface.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 net/ipv4/datagram.c | 2 +-
 net/ipv4/ping.c | 2 +-
 net/ipv4/raw.c  | 2 +-
 net/ipv4/udp.c  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr 
*uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
-   if (!oif)
+   if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8a0d568d7aec..c55ef53d87a8 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
tos |= RTO_ONLINK;
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3d59ab47a85d..f81097843031 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1039,7 +1039,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
-- 
2.11.0

[PATCH net-next v1 2/5] ipv6: allow link-local and multicast packets inside vrf

2018-09-24 Thread Mike Manning

Packets that are multicast or to link-local addresses are not enslaved
to the vrf of the socket that they are received on. This is needed for
NDISC, but breaks applications that rely on receiving such packets when
in a VRF. Also to make IPv6 consistent with IPv4 which does handle
multicast packets as being enslaved, modify the VRF driver to do the
same for IPv6. As a result, the multicast address check needs to verify
the address against the enslaved rather than the l3mdev device.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c| 19 +--
 net/ipv6/ip6_input.c | 19 ++-
 net/ipv6/ipv6_sockglue.c |  2 +-
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index f93547f257fb..9d817c19f3b4 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..108f5f88ec98 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -432,15 +432,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 7dfbc797b130..4ebd395dd3df 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next v1 4/5] ipv6: do not drop vrf udp multicast packets

2018-09-24 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev. Verify the multicast address
against the enslaved rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 24 
 net/ipv6/udp.c   |  8 +---
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 108f5f88ec98..82ffb5cdd2ab 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -324,11 +324,14 @@ void ipv6_list_rcv(struct list_head *head, struct 
packet_type *pt,
 static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
const struct inet6_protocol *ipprot;
+   int sdif = inet6_sdif(skb);
+   bool have_final = false;
struct inet6_dev *idev;
+   struct net_device *dev;
unsigned int nhoff;
+   bool deliver;
int nexthdr;
bool raw;
-   bool have_final = false;
 
/*
 *  Parse extension headers
@@ -371,9 +374,22 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
-   if (ipv6_addr_is_multicast(&hdr->daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
-   &hdr->saddr) &&
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   dev = dev_get_by_index_rcu(dev_net(skb->dev),
+  sdif);
+   if (!dev) {
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
+   deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr,
+ &hdr->saddr);
+   if (ipv6_addr_is_multicast(&hdr->daddr) && !deliver &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e22b7dd78c9b..35f71b7a1070 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next v1 5/5] ipv6: add vrf table handling code for ipv6 mcast

2018-09-24 Thread Mike Manning

From: Patrick Ruddy 

The code to obtain the correct table for the incoming interface was
missing for IPv6. This has been added along with the table creation
notification to fib rules for the RTNL_FAMILY_IP6MR address family.

Signed-off-by: Patrick Ruddy 
Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 11 +++
 net/ipv6/ip6mr.c  | 49 +
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9d817c19f3b4..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1214,8 +1214,19 @@ static int vrf_add_fib_rules(const struct net_device 
*dev)
goto ipmr_err;
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+   err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
+   if (err < 0)
+   goto ip6mr_err;
+#endif
+
return 0;
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+ip6mr_err:
+   vrf_fib_rule(dev, RTNL_FAMILY_IPMR,  false);
+#endif
+
 #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
 ipmr_err:
vrf_fib_rule(dev, AF_INET6,  false);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..1ecc88456dc5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 
id);
 static void ip6mr_free_table(struct mr_table *mrt);
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *cache);
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *cache);
 static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert);
 static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
@@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 
*flp6,
.flags = FIB_LOOKUP_NOREF,
};
 
+   /* update flow if oif or iif point to device enslaved to l3mdev */
+   l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
   flowi6_to_flowi(flp6), 0, );
if (err < 0)
@@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct 
flowi *flp,
return -EINVAL;
}
 
-   mrt = ip6mr_get_table(rule->fr_net, rule->table);
+   arg->table = fib_rule_get_table(rule, arg);
+
+   mrt = ip6mr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
-   ip6_mr_forward(net, mrt, skb, c);
+   ip6_mr_forward(net, mrt, skb->dev, skb, c);
}
 }
 
@@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, 
struct sk_buff *pkt,
 
 /* Queue a packet for resolution. It gets locked cache entry! */
 static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
 {
struct mfc6_cache *c;
bool found = false;
@@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, 
mifi_t mifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+   if (dev) {
+   skb->dev = dev;
+   skb->skb_iif = dev->ifindex;
+   }
skb_queue_tail(>_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct 
net_device *dev)
 }
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *c)
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *c)
 {
int psend = -1;
int vif, ct;
-   int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+   int true_vifi = ip6mr_find_vif(mrt, dev);
 
vif = c->_c.mfc_parent;
c->_c.mfc_un.res.pkt++;
@@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct 
mr_table *mrt,
/*
 * Wrong interface: drop packet and (maybe) send PIM assert.
 */
-   if (mrt->vif_table[vif].dev != skb->dev) {
+   if (mrt->vif_table[vif].dev != dev) {
c->_c.mfc_un.res.wrong_if++;
 
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2146,6 +2157,7 @@ static void ip6_mr_forward(struct net *n

Re: [PATCH net-next 4/5] ipv6: do not drop vrf udp multicast packets

2018-09-20 Thread Mike Manning

On 20/09/2018 14:02, Paolo Abeni wrote:
> Hi,
>
> On Thu, 2018-09-20 at 09:58 +0100, Mike Manning wrote:
>> diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
>> index 108f5f88ec98..fc60f297d95b 100644
>> --- a/net/ipv6/ip6_input.c
>> +++ b/net/ipv6/ip6_input.c
>> @@ -325,9 +325,12 @@ static int ip6_input_finish(struct net *net, struct 
>> sock *sk, struct sk_buff *sk
>>  {
>>  const struct inet6_protocol *ipprot;
>>  struct inet6_dev *idev;
>> +struct net_device *dev;
>>  unsigned int nhoff;
>> +int sdif = inet6_sdif(skb);
>>  int nexthdr;
>>  bool raw;
>> +bool deliver;
>>  bool have_final = false;
> Please, try instead to sort the variable in reverse x-mas tree order.
Will do.
>>  
>>  /*
>> @@ -371,9 +374,27 @@ static int ip6_input_finish(struct net *net, struct 
>> sock *sk, struct sk_buff *sk
>>  skb_postpull_rcsum(skb, skb_network_header(skb),
>> skb_network_header_len(skb));
>>  hdr = ipv6_hdr(skb);
>> -if (ipv6_addr_is_multicast(>daddr) &&
>> -!ipv6_chk_mcast_addr(skb->dev, >daddr,
>> ->saddr) &&
>> +
>> +/* skb->dev passed may be master dev for vrfs. */
>> +if (sdif) {
>> +rcu_read_lock();
> AFAICS, the rcu lock is already acquired at the beginning of
> ip6_input_finish(), not need to acquire it here again.
Nice catch, I will remove this.
> + dev = dev_get_by_index_rcu(dev_net(skb->dev),
>> +   sdif);
>> +if (!dev) {
>> +rcu_read_unlock();
>> +kfree_skb(skb);
>> +return -ENODEV;
>> +}
>> +} else {
>> +dev = skb->dev;
> The above fragment of code is a recurring pattern in this series,
> perhaps adding an helper for it would reduce code duplication ?

This pattern of checking the secondary device index is used only twice, both in 
this file.

But now that one instance has the rcu lock handling and the other does not, I 
cannot refactor this into a common helper.
>
> Cheers,
>
> Paolo
>
Thanks for the review! I will wait for further comments before producing a v1 
of the series.

Regards, Mike

[PATCH net-next 0/5] vrf: allow simultaneous service instances in default and other VRFs

2018-09-20 Thread Mike Manning

Services currently have to be VRF-aware if they are using an unbound
socket. One cannot have multiple service instances running in the
default and other VRFs for services that are not VRF-aware and listen
on an unbound socket. This is because there is no way of isolating
packets received in the default VRF from those arriving in other VRFs.

This series provides this isolation subject to the existing kernel
parameter net.ipv4.tcp_l3mdev_accept not being set, given that this is
documented as allowing a single service instance to work across all
VRF domains. The functionality applies to UDP & TCP services, for IPv4
and IPv6, in particular adding VRF table handling for IPv6 multicast.

Example of running ssh instances in default and blue VRF:

$ /usr/sbin/sshd -D
$ ip vrf exec vrf-blue /usr/sbin/sshd
$ ss -ta | egrep 'State|ssh'
State   Recv-Q  Send-Q  Local Address:Port      Peer Address:Port
LISTEN  0       128     0.0.0.0%vrf-blue:ssh    0.0.0.0:*
LISTEN  0       128     0.0.0.0:ssh             0.0.0.0:*
ESTAB   0       0       192.168.122.220:ssh     192.168.122.1:50282
LISTEN  0       128     [::]%vrf-blue:ssh       [::]:*
LISTEN  0       128     [::]:ssh                [::]:*
ESTAB   0       0       [3000::2]%vrf-blue:ssh  [3000::9]:45896
ESTAB   0       0       [2000::2]:ssh           [2000::9]:46398

Dewi Morgan (1):
  ipv6: do not drop vrf udp multicast packets

Mike Manning (1):
  ipv6: allow link-local and multicast packets inside vrf

Patrick Ruddy (1):
  ipv6: add vrf table handling code for ipv6 mcast

Robert Shearman (2):
  net: allow binding socket in a VRF when there's an unbound socket
  ipv4: Allow sending multicast packets on specific i/f using VRF socket

 Documentation/networking/vrf.txt |  9 
 drivers/net/vrf.c| 30 
 include/net/inet6_hashtables.h   |  5 ++--
 include/net/inet_hashtables.h| 21 +++--
 include/net/inet_sock.h  | 13 +++
 net/core/sock.c  |  2 ++
 net/ipv4/datagram.c  |  2 +-
 net/ipv4/inet_connection_sock.c  | 13 ---
 net/ipv4/inet_hashtables.c   | 34 +---
 net/ipv4/ip_sockglue.c   |  3 +++
 net/ipv4/ping.c  |  2 +-
 net/ipv4/raw.c   |  6 ++---
 net/ipv4/udp.c   | 17 ++
 net/ipv6/datagram.c  |  5 +++-
 net/ipv6/inet6_hashtables.c  | 14 +---
 net/ipv6/ip6_input.c | 46 +
 net/ipv6/ip6mr.c | 49 ++--
 net/ipv6/ipv6_sockglue.c |  5 +++-
 net/ipv6/raw.c   |  6 ++---
 net/ipv6/udp.c   | 22 --
 20 files changed, 208 insertions(+), 96 deletions(-)

-- 
2.11.0

[PATCH net-next 2/5] ipv6: allow link-local and multicast packets inside vrf

2018-09-20 Thread Mike Manning

Packets that are multicast or to link-local addresses are not enslaved
to the vrf of the socket that they are received on. This is needed for
NDISC, but breaks applications that rely on receiving such packets when
in a VRF. Also to make IPv6 consistent with IPv4 which does handle
multicast packets as being enslaved, modify the VRF driver to do the
same for IPv6. As a result, the multicast address check needs to verify
the address against the enslaved rather than the l3mdev device.

Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c| 19 +--
 net/ipv6/ip6_input.c | 19 ++-
 net/ipv6/ipv6_sockglue.c |  2 +-
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index f93547f257fb..9d817c19f3b4 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -981,24 +981,23 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device 
*vrf_dev,
   struct sk_buff *skb)
 {
int orig_iif = skb->skb_iif;
-   bool need_strict;
+   bool need_strict = rt6_need_strict(_hdr(skb)->daddr);
+   bool is_ndisc = ipv6_ndisc_frame(skb);
 
-   /* loopback traffic; do not push through packet taps again.
-* Reset pkt_type for upper layers to process skb
+   /* loopback, multicast & non-ND link-local traffic; do not push through
+* packet taps again. Reset pkt_type for upper layers to process skb
 */
-   if (skb->pkt_type == PACKET_LOOPBACK) {
+   if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
-   skb->pkt_type = PACKET_HOST;
+   if (skb->pkt_type == PACKET_LOOPBACK)
+   skb->pkt_type = PACKET_HOST;
goto out;
}
 
-   /* if packet is NDISC or addressed to multicast or link-local
-* then keep the ingress interface
-*/
-   need_strict = rt6_need_strict(_hdr(skb)->daddr);
-   if (!ipv6_ndisc_frame(skb) && !need_strict) {
+   /* if packet is NDISC then keep the ingress interface */
+   if (!is_ndisc) {
vrf_rx_stats(vrf_dev, skb->len);
skb->dev = vrf_dev;
skb->skb_iif = vrf_dev->ifindex;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..108f5f88ec98 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -432,15 +432,32 @@ EXPORT_SYMBOL_GPL(ip6_input);
 
 int ip6_mc_input(struct sk_buff *skb)
 {
+   int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
+   struct net_device *dev;
bool deliver;
 
__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 skb->len);
 
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
hdr = ipv6_hdr(skb);
-   deliver = ipv6_chk_mcast_addr(skb->dev, >daddr, NULL);
+   deliver = ipv6_chk_mcast_addr(dev, >daddr, NULL);
+   if (sdif)
+   rcu_read_unlock();
 
 #ifdef CONFIG_IPV6_MROUTE
/*
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 7dfbc797b130..4ebd395dd3df 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -486,7 +486,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
retv = -EFAULT;
break;
}
-   if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != 
sk->sk_bound_dev_if)
+   if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
 
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
-- 
2.11.0

[PATCH net-next 5/5] ipv6: add vrf table handling code for ipv6 mcast

2018-09-20 Thread Mike Manning

From: Patrick Ruddy 

The code to obtain the correct table for the incoming interface was
missing for IPv6. This has been added along with the table creation
notification to fib rules for the RTNL_FAMILY_IP6MR address family.

Signed-off-by: Patrick Ruddy 
Signed-off-by: Mike Manning 
---
 drivers/net/vrf.c | 11 +++
 net/ipv6/ip6mr.c  | 49 +
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9d817c19f3b4..21ad4b1d7f03 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1214,8 +1214,19 @@ static int vrf_add_fib_rules(const struct net_device 
*dev)
goto ipmr_err;
 #endif
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+   err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
+   if (err < 0)
+   goto ip6mr_err;
+#endif
+
return 0;
 
+#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
+ip6mr_err:
+   vrf_fib_rule(dev, RTNL_FAMILY_IPMR,  false);
+#endif
+
 #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
 ipmr_err:
vrf_fib_rule(dev, AF_INET6,  false);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..1ecc88456dc5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -85,7 +85,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 
id);
 static void ip6mr_free_table(struct mr_table *mrt);
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *cache);
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *cache);
 static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
  mifi_t mifi, int assert);
 static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
@@ -138,6 +139,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 
*flp6,
.flags = FIB_LOOKUP_NOREF,
};
 
+   /* update flow if oif or iif point to device enslaved to l3mdev */
+   l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
   flowi6_to_flowi(flp6), 0, );
if (err < 0)
@@ -164,7 +168,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct 
flowi *flp,
return -EINVAL;
}
 
-   mrt = ip6mr_get_table(rule->fr_net, rule->table);
+   arg->table = fib_rule_get_table(rule, arg);
+
+   mrt = ip6mr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -1014,7 +1020,7 @@ static void ip6mr_cache_resolve(struct net *net, struct 
mr_table *mrt,
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else
-   ip6_mr_forward(net, mrt, skb, c);
+   ip6_mr_forward(net, mrt, skb->dev, skb, c);
}
 }
 
@@ -1120,7 +1126,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, 
struct sk_buff *pkt,
 
 /* Queue a packet for resolution. It gets locked cache entry! */
 static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
 {
struct mfc6_cache *c;
bool found = false;
@@ -1180,6 +1186,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, 
mifi_t mifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+   if (dev) {
+   skb->dev = dev;
+   skb->skb_iif = dev->ifindex;
+   }
skb_queue_tail(>_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -2043,11 +2053,12 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct 
net_device *dev)
 }
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-  struct sk_buff *skb, struct mfc6_cache *c)
+  struct net_device *dev, struct sk_buff *skb,
+  struct mfc6_cache *c)
 {
int psend = -1;
int vif, ct;
-   int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+   int true_vifi = ip6mr_find_vif(mrt, dev);
 
vif = c->_c.mfc_parent;
c->_c.mfc_un.res.pkt++;
@@ -2073,7 +2084,7 @@ static void ip6_mr_forward(struct net *net, struct 
mr_table *mrt,
/*
 * Wrong interface: drop packet and (maybe) send PIM assert.
 */
-   if (mrt->vif_table[vif].dev != skb->dev) {
+   if (mrt->vif_table[vif].dev != dev) {
c->_c.mfc_un.res.wrong_if++;
 
if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2146,6 +2157,7 @@ static void ip6_mr_forward(struct net *n

[PATCH net-next 4/5] ipv6: do not drop vrf udp multicast packets

2018-09-20 Thread Mike Manning

From: Dewi Morgan 

For bound udp sockets in a vrf, also check the sdif to get the index
for ingress devices enslaved to an l3mdev. Verify the multicast address
against the enslaved rather than the l3mdev device.

Signed-off-by: Dewi Morgan 
Signed-off-by: Mike Manning 
---
 net/ipv6/ip6_input.c | 27 ---
 net/ipv6/udp.c   |  8 +---
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 108f5f88ec98..fc60f297d95b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -325,9 +325,12 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
 {
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
+   struct net_device *dev;
unsigned int nhoff;
+   int sdif = inet6_sdif(skb);
int nexthdr;
bool raw;
+   bool deliver;
bool have_final = false;
 
/*
@@ -371,9 +374,27 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
skb_postpull_rcsum(skb, skb_network_header(skb),
   skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
-   if (ipv6_addr_is_multicast(>daddr) &&
-   !ipv6_chk_mcast_addr(skb->dev, >daddr,
-   >saddr) &&
+
+   /* skb->dev passed may be master dev for vrfs. */
+   if (sdif) {
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(dev_net(skb->dev),
+  sdif);
+   if (!dev) {
+   rcu_read_unlock();
+   kfree_skb(skb);
+   return -ENODEV;
+   }
+   } else {
+   dev = skb->dev;
+   }
+
+   deliver = ipv6_chk_mcast_addr(dev, >daddr,
+ >saddr);
+   if (sdif)
+   rcu_read_unlock();
+
+   if (ipv6_addr_is_multicast(>daddr) && !deliver &&
!ipv6_is_mld(skb, nexthdr, 
skb_network_header_len(skb)))
goto discard;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e22b7dd78c9b..35f71b7a1070 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -637,7 +637,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
   __be16 loc_port, const struct in6_addr 
*loc_addr,
   __be16 rmt_port, const struct in6_addr 
*rmt_addr,
-  int dif, unsigned short hnum)
+  int dif, int sdif, unsigned short hnum)
 {
struct inet_sock *inet = inet_sk(sk);
 
@@ -649,7 +649,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct 
sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(>sk_v6_daddr) &&
!ipv6_addr_equal(>sk_v6_daddr, rmt_addr)) ||
-   (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+   !inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) ||
(!ipv6_addr_any(>sk_v6_rcv_saddr) &&
!ipv6_addr_equal(>sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -683,6 +683,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+   int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
 
@@ -697,7 +698,8 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct 
sk_buff *skb,
 
sk_for_each_entry_offset_rcu(sk, node, >head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
-   uh->source, saddr, dif, hnum))
+   uh->source, saddr, dif, sdif,
+   hnum))
continue;
/* If zero checksum and no_check is not on for
 * the socket then skip it.
-- 
2.11.0

[PATCH net-next 1/5] net: allow binding socket in a VRF when there's an unbound socket

2018-09-20 Thread Mike Manning

From: Robert Shearman 

There is no easy way currently for applications that want to receive
packets in the default VRF to be isolated from packets arriving in
VRFs, which makes using VRF-unaware applications in a VRF-aware system
a potential security risk.

So change the inet socket lookup to avoid packets arriving on a device
enslaved to an l3mdev from matching unbound sockets by removing the
wildcard for non sk_bound_dev_if and instead relying on check against
the secondary device index, which will be 0 when the input device is
not enslaved to an l3mdev and so match against an unbound socket and
not match when the input device is enslaved.

The existing net.ipv4.tcp_l3mdev_accept & net.ipv4.udp_l3mdev_accept
sysctls, which are documented as allowing services to work across all VRF
domains, can be used to also work in the default VRF by causing
unbound sockets to match against packets arriving on a device
enslaved to an l3mdev.

Change the socket binding to take the l3mdev into account to allow an
unbound socket to not conflict with sockets bound to an l3mdev given the
datapath isolation now guaranteed.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 Documentation/networking/vrf.txt |  9 +
 include/net/inet6_hashtables.h   |  5 ++---
 include/net/inet_hashtables.h| 21 ++---
 include/net/inet_sock.h  | 13 +
 net/core/sock.c  |  2 ++
 net/ipv4/inet_connection_sock.c  | 13 ++---
 net/ipv4/inet_hashtables.c   | 34 +-
 net/ipv4/ip_sockglue.c   |  3 +++
 net/ipv4/raw.c   |  4 ++--
 net/ipv4/udp.c   | 15 ++-
 net/ipv6/datagram.c  |  5 -
 net/ipv6/inet6_hashtables.c  | 14 ++
 net/ipv6/ipv6_sockglue.c |  3 +++
 net/ipv6/raw.c   |  6 +++---
 net/ipv6/udp.c   | 14 +-
 15 files changed, 99 insertions(+), 62 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 8ff7b4c8f91b..d4b129402d57 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -103,6 +103,11 @@ VRF device:
 
 or to specify the output device using cmsg and IP_PKTINFO.
 
+By default the scope of the port bindings for unbound sockets is
+limited to the default VRF. That is, it will not be matched by packets
+arriving on interfaces enslaved to an l3mdev and processes may bind to
+the same port if they bind to an l3mdev.
+
 TCP & UDP services running in the default VRF context (ie., not bound
 to any VRF device) can work across all VRF domains by enabling the
 tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
@@ -112,10 +117,6 @@ tcp_l3mdev_accept and udp_l3mdev_accept sysctl options:
 netfilter rules on the VRF device can be used to limit access to services
 running in the default VRF context as well.
 
-The default VRF does not have limited scope with respect to port bindings.
-That is, if a process does a wildcard bind to a port in the default VRF it
-owns the port across all VRF domains within the network namespace.
-
 

 
 Using iproute2 for VRFs
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 6e91e38a31da..9db98af46985 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -115,9 +115,8 @@ int inet6_hash(struct sock *sk);
 ((__sk)->sk_family == AF_INET6)&&  \
 ipv6_addr_equal(&(__sk)->sk_v6_daddr, (__saddr))   &&  
\
 ipv6_addr_equal(&(__sk)->sk_v6_rcv_saddr, (__daddr))   &&  \
-(!(__sk)->sk_bound_dev_if  ||  \
-  ((__sk)->sk_bound_dev_if == (__dif)) ||  \
-  ((__sk)->sk_bound_dev_if == (__sdif)))   &&  \
+(((__sk)->sk_bound_dev_if == (__dif))  ||  \
+ ((__sk)->sk_bound_dev_if == (__sdif)))&&  \
 net_eq(sock_net(__sk), (__net)))
 
 #endif /* _INET6_HASHTABLES_H */
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..ec279bcd0958 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -79,6 +79,7 @@ struct inet_ehash_bucket {
 
 struct inet_bind_bucket {
possible_net_t  ib_net;
+   int l3mdev;
unsigned short  port;
signed char fastreuse;
signed char fastreuseport;
@@ -188,10 +189,18 @@ static inline void inet_ehash_locks_free(struct 
inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
 }
 
+static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
+

[PATCH net-next 3/5] ipv4: Allow sending multicast packets on specific i/f using VRF socket

2018-09-20 Thread Mike Manning

From: Robert Shearman 

It is useful to be able to use the same socket for listening in a
specific VRF, as for sending multicast packets out of a specific
interface. However, the bound device on the socket currently takes
precedence and results in the packets not being sent.

Relax the condition on overriding the output interface to use for
sending packets out of UDP, raw and ping sockets to allow multicast
packets to be sent using the specified multicast interface.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 net/ipv4/datagram.c | 2 +-
 net/ipv4/ping.c | 2 +-
 net/ipv4/raw.c  | 2 +-
 net/ipv4/udp.c  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..300921417f89 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -42,7 +42,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr 
*uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
-   if (!oif)
+   if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..7ccb5f87f70b 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -779,7 +779,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8a0d568d7aec..c55ef53d87a8 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -608,7 +608,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
tos |= RTO_ONLINK;
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3d59ab47a85d..f81097843031 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1039,7 +1039,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
}
 
if (ipv4_is_multicast(daddr)) {
-   if (!ipc.oif)
+   if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
-- 
2.11.0

[PATCH net-next] ipv6: Allow the l3mdev to be a loopback

2018-09-19 Thread Mike Manning

From: Robert Shearman 

There is no way currently for an IPv6 client to connect using a loopback
address in a VRF, whereas for IPv4 the loopback address can be added:

$ sudo ip addr add dev vrfred 127.0.0.1/8
$ sudo ip -6 addr add ::1/128 dev vrfred
RTNETLINK answers: Cannot assign requested address

So allow ::1 to be configured on an L3 master device. In order for
this to be usable ip6_route_output_flags needs to not consider ::1 to
be a link scope address (since oif == l3mdev and so it would be
dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback
device so that it doesn't drop the packets.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 net/ipv6/addrconf.c  | 1 +
 net/ipv6/ip6_input.c | 3 ++-
 net/ipv6/route.c | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d4733160e6b7..bfe3ec7ecb14 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config 
*cfg,
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
+!netif_is_l3_master(idev->dev) &&
 addr_type & IPV6_ADDR_LOOPBACK))
return ERR_PTR(-EADDRNOTAVAIL);
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, 
struct net_device *dev,
 */
if ((ipv6_addr_loopback(>saddr) ||
 ipv6_addr_loopback(>daddr)) &&
-!(dev->flags & IFF_LOOPBACK))
+   !(dev->flags & IFF_LOOPBACK) &&
+   !netif_is_l3_master(dev))
goto err;
 
/* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fa62acc923c..f36ee8a3314f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, 
const struct sock *sk,
 {
bool any_src;
 
-   if (rt6_need_strict(>daddr)) {
+   if (ipv6_addr_type(>daddr) &
+   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
 
dst = l3mdev_link_scope_lookup(net, fl6);
-- 
2.11.0

[PATCH net-next] ipv6: Allow the l3mdev to be a loopback

2018-09-19 Thread Mike Manning

There is no way currently for an IPv6 client to connect using a loopback
address in a VRF, whereas for IPv4 the loopback address can be added:

$ sudo ip addr add dev vrfred 127.0.0.1/8
$ sudo ip -6 addr add ::1/128 dev vrfred
RTNETLINK answers: Cannot assign requested address

So allow ::1 to be configured on an L3 master device. In order for
this to be usable ip6_route_output_flags needs to not consider ::1 to
be a link scope address (since oif == l3mdev and so it would be
dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback
device so that it doesn't drop the packets.

Signed-off-by: Robert Shearman 
---
 net/ipv6/addrconf.c  | 1 +
 net/ipv6/ip6_input.c | 3 ++-
 net/ipv6/route.c | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d4733160e6b7..bfe3ec7ecb14 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config 
*cfg,
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
+!netif_is_l3_master(idev->dev) &&
 addr_type & IPV6_ADDR_LOOPBACK))
return ERR_PTR(-EADDRNOTAVAIL);
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, 
struct net_device *dev,
 */
if ((ipv6_addr_loopback(>saddr) ||
 ipv6_addr_loopback(>daddr)) &&
-!(dev->flags & IFF_LOOPBACK))
+   !(dev->flags & IFF_LOOPBACK) &&
+   !netif_is_l3_master(dev))
goto err;
 
/* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fa62acc923c..f36ee8a3314f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, 
const struct sock *sk,
 {
bool any_src;
 
-   if (rt6_need_strict(>daddr)) {
+   if (ipv6_addr_type(>daddr) &
+   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
 
dst = l3mdev_link_scope_lookup(net, fl6);
-- 
2.11.0

[PATCH] net: allow interface to be set into VRF if VLAN interface in same VRF

2018-02-26 Thread Mike Manning

Setting an interface into a VRF fails with 'RTNETLINK answers: File
exists' if one of its VLAN interfaces is already in the same VRF.
As the VRF is an upper device of the VLAN interface, it is also showing
up as an upper device of the interface itself. The solution is to
restrict this check to devices other than master. As only one master
device can be linked to a device, the check in this case is that the
upper device (VRF) being linked to is not the same as the master device
instead of it not being any one of the upper devices.

The following example shows an interface ens12 (with a VLAN interface
ens12.10) being set into VRF green, which behaves as expected:

  # ip link add link ens12 ens12.10 type vlan id 10
  # ip link set dev ens12 master vrfgreen
  # ip link show dev ens12
3: ens12: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel
   master vrfgreen state UP mode DEFAULT group default qlen 1000
   link/ether 52:54:00:4c:a0:45 brd ff:ff:ff:ff:ff:ff

But if the VLAN interface has previously been set into the same VRF,
then setting the interface into the VRF fails:

  # ip link set dev ens12 nomaster
  # ip link set dev ens12.10 master vrfgreen
  # ip link show dev ens12.10
39: ens12.10@ens12: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500
qdisc noqueue master vrfgreen state UP mode DEFAULT group default
qlen 1000 link/ether 52:54:00:4c:a0:45 brd ff:ff:ff:ff:ff:ff
  # ip link set dev ens12 master vrfgreen
RTNETLINK answers: File exists

The workaround is to move the VLAN interface back into the default VRF
beforehand, but it has to be shut first so as to avoid the risk of
traffic leaking from the VRF. This fix avoids needing this workaround.

Signed-off-by: Mike Manning <mmann...@att.com>
---
 net/core/dev.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d4362be..2cedf52 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6396,6 +6396,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
.linking = true,
.upper_info = upper_info,
};
+   struct net_device *master_dev;
int ret = 0;
 
ASSERT_RTNL();
@@ -6407,11 +6408,14 @@ static int __netdev_upper_dev_link(struct net_device 
*dev,
if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
 
-   if (netdev_has_upper_dev(dev, upper_dev))
-   return -EEXIST;
-
-   if (master && netdev_master_upper_dev_get(dev))
-   return -EBUSY;
+   if (!master) {
+   if (netdev_has_upper_dev(dev, upper_dev))
+   return -EEXIST;
+   } else {
+   master_dev = netdev_master_upper_dev_get(dev);
+   if (master_dev)
+   return master_dev == upper_dev ? -EEXIST : -EBUSY;
+   }
 
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
_info.info);
-- 
2.1.4

[PATCH] net: allow interface to be set into vrf if a vif in same vrf

2018-02-26 Thread Mike Manning

Setting an interface into a vrf fails with 'RTNETLINK answers: File
exists' if one of its vifs is already in the same vrf. As the vrf is an
upper device of the vif, it is also showing up as an upper device of
the interface itself. The solution is to restrict this check to devices
other than master. As only one master device can be linked to a device,
the check in this case is that the upper device (vrf) being linked to is
not the same as the master device, rather than it not being any one of
the upper devices.

Signed-off-by: Mike Manning <mmann...@att.com>
---
 net/core/dev.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d4362be..2cedf52 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6396,6 +6396,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
.linking = true,
.upper_info = upper_info,
};
+   struct net_device *master_dev;
int ret = 0;
 
ASSERT_RTNL();
@@ -6407,11 +6408,14 @@ static int __netdev_upper_dev_link(struct net_device 
*dev,
if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
 
-   if (netdev_has_upper_dev(dev, upper_dev))
-   return -EEXIST;
-
-   if (master && netdev_master_upper_dev_get(dev))
-   return -EBUSY;
+   if (!master) {
+   if (netdev_has_upper_dev(dev, upper_dev))
+   return -EEXIST;
+   } else {
+   master_dev = netdev_master_upper_dev_get(dev);
+   if (master_dev)
+   return master_dev == upper_dev ? -EEXIST : -EBUSY;
+   }
 
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
&changeupper_info.info);
-- 
2.1.4

[PATCH net-next] net: ipv6: send NS for DAD when link operationally up

2017-09-25 Thread Mike Manning

The NS for DAD are sent on admin up as long as a valid qdisc is found.
A race condition exists by which these packets will not egress the
interface if the operational state of the lower device is not yet up.
The solution is to delay DAD until the link is operationally up
according to RFC2863. Rather than only doing this, follow the existing
code checks by deferring IPv6 device initialization altogether. The fix
allows DAD on devices like tunnels that are controlled by userspace
control plane. The fix has no impact on regular deployments, but means
that there is no IPv6 connectivity until the port has been opened in
the case of port-based network access control, which should be
desirable.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c2e2a78..dffbf3b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt 
__read_mostly = {
.disable_policy = 0,
 };
 
-/* Check if a valid qdisc is available */
-static inline bool addrconf_qdisc_ok(const struct net_device *dev)
+/* Check if link is ready: is it up and is a valid qdisc available */
+static inline bool addrconf_link_ready(const struct net_device *dev)
 {
-   return !qdisc_tx_is_noop(dev);
+   return netif_oper_up(dev) && !qdisc_tx_is_noop(dev);
 }
 
 static void addrconf_del_rs_timer(struct inet6_dev *idev)
@@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device 
*dev)
 
ndev->token = in6addr_any;
 
-   if (netif_running(dev) && addrconf_qdisc_ok(dev))
+   if (netif_running(dev) && addrconf_link_ready(dev))
ndev->if_flags |= IF_READY;
 
ipv6_mc_init_dev(ndev);
@@ -3393,7 +3393,7 @@ static int addrconf_notify(struct notifier_block *this, 
unsigned long event,
/* restore routes for permanent addresses */
addrconf_permanent_addr(dev);
 
-   if (!addrconf_qdisc_ok(dev)) {
+   if (!addrconf_link_ready(dev)) {
/* device is not ready yet. */
pr_info("ADDRCONF(NETDEV_UP): %s: link is not 
ready\n",
dev->name);
@@ -3408,7 +3408,7 @@ static int addrconf_notify(struct notifier_block *this, 
unsigned long event,
run_pending = 1;
}
} else if (event == NETDEV_CHANGE) {
-   if (!addrconf_qdisc_ok(dev)) {
+   if (!addrconf_link_ready(dev)) {
/* device is still not ready. */
break;
}
-- 
2.1.4

Re: [PATCH] net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure

2017-09-18 Thread Mike Manning

Hi,
In the absence of a reply from Mahesh, I would be most grateful for
anyone familiar with the IPv6 code to review this 1-line fix.

Or if not, then I request that the commit f784ad3d79e5 is backed out,
as its intention is to remove the redundant but harmless RTM_DELADDR
for addresses in tentative state, but is also incorrectly removing the
very necessary RTM_DELADDR when an address is deleted that was previously
notified with an RTM_NEWADDR as being in tentative dadfailed state.

Thanks
Mike

On 08/09/17 03:18, David Miller wrote:
> From: Mike Manning <mmann...@brocade.com>
> Date: Mon,  4 Sep 2017 15:52:55 +0100
> 
>> Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative
>> addresses") incorrectly assumes that no RTM_NEWADDR are sent for
>> addresses in tentative state, as this does happen for the standard
>> IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in
>> addrconf_dad_stop(). So as a result of this change, no RTM_DELADDR is
>> sent after DAD failure for a link-local when strict DAD (accept_dad=2)
>> is configured, or on the next admin down in other cases. The absence
>> of this notification breaks backwards compatibility and causes problems
>> after DAD failure if this notification was being relied on. The
>> solution is to allow RTM_DELADDR to still be sent after DAD failure.
>>
>> Fixes: f784ad3d79e5("ipv6: do not send RTM_DELADDR for tentative addresses")
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> Cc: Mahesh Bandewar <mahe...@google.com>
> 
> Mahesh, please review this patch.
>

[PATCH] net: ipv6: fix regression of no RTM_DELADDR sent after DAD failure

2017-09-04 Thread Mike Manning

Commit f784ad3d79e5 ("ipv6: do not send RTM_DELADDR for tentative
addresses") incorrectly assumes that no RTM_NEWADDR are sent for
addresses in tentative state, as this does happen for the standard
IPv6 use-case of DAD failure, see the call to ipv6_ifa_notify() in
addrconf_dad_stop(). So as a result of this change, no RTM_DELADDR is
sent after DAD failure for a link-local when strict DAD (accept_dad=2)
is configured, or on the next admin down in other cases. The absence
of this notification breaks backwards compatibility and causes problems
after DAD failure if this notification was being relied on. The
solution is to allow RTM_DELADDR to still be sent after DAD failure.

Fixes: f784ad3d79e5("ipv6: do not send RTM_DELADDR for tentative addresses")
Signed-off-by: Mike Manning <mmann...@brocade.com>
Cc: Mahesh Bandewar <mahe...@google.com>
---
 net/ipv6/addrconf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 936e9ab..ba757c2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4982,9 +4982,10 @@ static void inet6_ifa_notify(int event, struct 
inet6_ifaddr *ifa)
 
/* Don't send DELADDR notification for TENTATIVE address,
 * since NEWADDR notification is sent only after removing
-* TENTATIVE flag.
+* TENTATIVE flag, if DAD has not failed.
 */
-   if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+   if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) &&
+   event == RTM_DELADDR)
return;
 
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
-- 
2.1.4

Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address

2017-05-05 Thread Mike Manning

On 03/05/17 19:24, Mike Manning wrote:
> On 03/05/17 18:58, Cong Wang wrote:
>> On Tue, May 2, 2017 at 11:30 AM, Mike Manning <mmann...@brocade.com> wrote:
>>> While this is not reproducible manually, Andrey's syzkaller program hit
>>> the warning "IPv6: Freeing alive inet6 address" with this part trace:
>>>
>>> inet6_ifa_finish_destroy+0x12e/0x190 c:894
>>> in6_ifa_put ./include/net/addrconf.h:330
>>> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
>>>
>>> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
>>> than after calling addrconf_ifdown(), as the latter may remove it from
>>> the address hash table.
>>>
>>> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict 
>>> DAD")
>>> Reported-by: Andrey Konovalov <andreyk...@google.com>
>>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>>> ---
>>>  net/ipv6/addrconf.c | 6 +-
>>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>>> index 80ce478..361993a 100644
>>> --- a/net/ipv6/addrconf.c
>>> +++ b/net/ipv6/addrconf.c
>>> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
>>> } else if (action == DAD_ABORT) {
>>> in6_ifa_hold(ifp);
>>> addrconf_dad_stop(ifp, 1);
>>> -   if (disable_ipv6)
>>> +   if (disable_ipv6) {
>>> +   in6_ifa_put(ifp);
>>> addrconf_ifdown(idev->dev, 0);
>>> +   goto unlock;
>>> +   }
>>
>>
>> But addrconf_dad_stop() calls ipv6_del_addr() which could unhash
>> the addr too...
>>

Further investigation shows that none of the code block above is at fault.
Debugging shows that the problem is happening with DAD_BEGIN and not DAD_ABORT.
More detail on the issue follows, but as I do not have a fix at this stage, I
retract this submission altogether.

The problem is due to rapidly adding the same address fd00::bb on ip6tnl0, and
also without running DAD (accept_dad < 1), so it's an edge case. Typically the
call to addrconf_dad_work() starts with an ifp refcnt of 3. Then via
addrconf_dad_begin() and addrconf_dad_completed(), the call to
addrconf_del_dad_work() results in a decrement of the refcnt to 2 due to the
call to cancel_delayed_work() returning 1.

The 2nd normal case is if the call to addrconf_dad_work() starts with an ifp
refcnt of 2, in which case the call to cancel_delayed_work() returns 0 and so
there is no decrement of the refcnt, which correctly stays at 2.

The error case is when the call to addrconf_dad_work() starts with an ifp
refcnt of 2, but the call to cancel_delayed_work() then also results in a
decrement of the refcnt to 1, so the final in6_ifa_put() detects that the
refcnt is being reduced to 0 for an active address.

So the question is whether the interaction of cancel_delayed_work() in
addrconf_dad_work(), delayed_work_pending() in addrconf_mod_dad_work() and
INIT_DELAYED_WORK in ipv6_add_addr() [along with the handling for this when
deleting addresses] needs improving, and if so how?

Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address

2017-05-03 Thread Mike Manning

On 03/05/17 18:58, Cong Wang wrote:
> On Tue, May 2, 2017 at 11:30 AM, Mike Manning <mmann...@brocade.com> wrote:
>> While this is not reproducible manually, Andrey's syzkaller program hit
>> the warning "IPv6: Freeing alive inet6 address" with this part trace:
>>
>> inet6_ifa_finish_destroy+0x12e/0x190 c:894
>> in6_ifa_put ./include/net/addrconf.h:330
>> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
>>
>> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
>> than after calling addrconf_ifdown(), as the latter may remove it from
>> the address hash table.
>>
>> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict 
>> DAD")
>> Reported-by: Andrey Konovalov <andreyk...@google.com>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> ---
>>  net/ipv6/addrconf.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>> index 80ce478..361993a 100644
>> --- a/net/ipv6/addrconf.c
>> +++ b/net/ipv6/addrconf.c
>> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
>> } else if (action == DAD_ABORT) {
>> in6_ifa_hold(ifp);
>> addrconf_dad_stop(ifp, 1);
>> -   if (disable_ipv6)
>> +   if (disable_ipv6) {
>> +   in6_ifa_put(ifp);
>> addrconf_ifdown(idev->dev, 0);
>> +   goto unlock;
>> +   }
> 
> 
> But addrconf_dad_stop() calls ipv6_del_addr() which could unhash
> the addr too...
> 

Agreed, and in the meantime Andrey has confirmed that this v1 patch
does not resolve the issue. The problem is not specific to my change
for removing addresses. It seems that generally here the in6_ifa_hold()
and matching in6_ifa_put() are surplus to requirement, as the address
refcnt is 2 even without the hold before calling DAD stop.

Re: [PATCH] net: ipv6: Fix warning of freeing alive inet6 address

2017-05-03 Thread Mike Manning

On reflection, please put this on hold subject to testing with syzkaller. I 
have not had a repro of the issue and so the fix even though harmless may not 
be effective.

Thanks
Mike

On 02/05/17 19:30, Mike Manning wrote:
> While this is not reproducible manually, Andrey's syzkaller program hit
> the warning "IPv6: Freeing alive inet6 address" with this part trace:
> 
> inet6_ifa_finish_destroy+0x12e/0x190 c:894
> in6_ifa_put ./include/net/addrconf.h:330
> addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
> 
> The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
> than after calling addrconf_ifdown(), as the latter may remove it from
> the address hash table.
> 
> Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict 
> DAD")
> Reported-by: Andrey Konovalov <andreyk...@google.com>
> Signed-off-by: Mike Manning <mmann...@brocade.com>
> ---
>  net/ipv6/addrconf.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 80ce478..361993a 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
>   } else if (action == DAD_ABORT) {
>   in6_ifa_hold(ifp);
>   addrconf_dad_stop(ifp, 1);
> - if (disable_ipv6)
> + if (disable_ipv6) {
> + in6_ifa_put(ifp);
>   addrconf_ifdown(idev->dev, 0);
> + goto unlock;
> + }
>   goto out;
>   }
>  
> @@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w)
> ifp->dad_nonce);
>  out:
>   in6_ifa_put(ifp);
> +unlock:
>   rtnl_unlock();
>  }
>  
>

[PATCH] net: ipv6: Fix warning of freeing alive inet6 address

2017-05-02 Thread Mike Manning

While this is not reproducible manually, Andrey's syzkaller program hit
the warning "IPv6: Freeing alive inet6 address" with this part trace:

inet6_ifa_finish_destroy+0x12e/0x190 c:894
in6_ifa_put ./include/net/addrconf.h:330
addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963

The fix is to call in6_ifa_put() for the inet6_ifaddr before rather
than after calling addrconf_ifdown(), as the latter may remove it from
the address hash table.

Fixes: 85b51b12115c ("net: ipv6: Remove addresses for failures with strict DAD")
Reported-by: Andrey Konovalov <andreyk...@google.com>
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 80ce478..361993a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3902,8 +3902,11 @@ static void addrconf_dad_work(struct work_struct *w)
} else if (action == DAD_ABORT) {
in6_ifa_hold(ifp);
addrconf_dad_stop(ifp, 1);
-   if (disable_ipv6)
+   if (disable_ipv6) {
+   in6_ifa_put(ifp);
addrconf_ifdown(idev->dev, 0);
+   goto unlock;
+   }
goto out;
}
 
@@ -3950,6 +3953,7 @@ static void addrconf_dad_work(struct work_struct *w)
  ifp->dad_nonce);
 out:
in6_ifa_put(ifp);
+unlock:
rtnl_unlock();
 }
 
-- 
2.1.4

Re: net/ipv6: warning in inet6_ifa_finish_destroy

2017-05-02 Thread Mike Manning

On 28/04/17 21:39, Cong Wang wrote:
> On Fri, Apr 28, 2017 at 6:08 AM, Andrey Konovalov <andreyk...@google.com> wrote:
>> Hi,
>>
>> I've got the following error report while fuzzing the kernel with syzkaller.
>>
>> On commit 5a7ad1146caa895ad718a534399e38bd2ba721b7 (4.11-rc8).
>>
>> C reproducer and .config are attached.
>> It takes 1-2 minutes of running the reproducer to trigger the issue.
>>
>> [ cut here ]
>> WARNING: CPU: 0 PID: 21 at net/ipv6/addrconf.c:894
>> inet6_ifa_finish_destroy+0x12e/0x190
>> Modules linked in:
>> CPU: 0 PID: 21 Comm: kworker/0:1 Not tainted 4.11.0-rc8+ #296
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> Workqueue: ipv6_addrconf addrconf_dad_work
>> Call Trace:
>>  __dump_stack lib/dump_stack.c:16
>>  dump_stack+0x292/0x398 lib/dump_stack.c:52
>>  __warn+0x19f/0x1e0 kernel/panic.c:549
>>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584
>>  inet6_ifa_finish_destroy+0x12e/0x190 c:894
>>  in6_ifa_put ./include/net/addrconf.h:330
>>  addrconf_dad_work+0x4e9/0x1040 net/ipv6/addrconf.c:3963
> 
> 
> I don't look too much, but a quick glance shows in the following
> path:
> 
> } else if (action == DAD_ABORT) {
> in6_ifa_hold(ifp);
> addrconf_dad_stop(ifp, 1);
> if (disable_ipv6)
> addrconf_ifdown(idev->dev, 0);
> goto out;
> }
> 
> the inet6_addr could be removed from hash table in
> addrconf_ifdown() before calling in6_ifa_put(). which causes
> this warning.
> 

git describe 85b51b12115c79cce7ea1ced6c0bd0339a165d3f --contains
v4.8-rc5~34^2~32

This fix was introduced in 4.8, so it is interesting that this problem is only 
showing up now for 4.11. Also, it is not reproducible manually, i.e. DAD 
failure with disable_ipv6 works just fine without triggering this warning, with 
or without keeping IPv6 addresses on admin down.

I will go ahead with putting out a fix so that in6_ifa_put() precedes the call 
to addrconf_ifdown() in this case.

Thanks for the heads up on this,
Mike

[PATCH v2 net-next] bridge: add per-port broadcast flood flag

2017-04-26 Thread Mike Manning

Support for l2 multicast flood control was added in commit b6cb5ac8331b
("net: bridge: add per-port multicast flood flag"). It allows broadcast
as it was introduced specifically for unknown multicast flood control.
But as broadcast is a special case of multicast, this may also need to
be disabled. For this purpose, introduce a flag to disable the flooding
of received l2 broadcasts. This approach is backwards compatible and
provides flexibility in filtering for the desired packet types.

Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 include/linux/if_bridge.h|  1 +
 include/uapi/linux/if_link.h |  1 +
 net/bridge/br_forward.c  | 24 +++++++++++++++++-------
 net/bridge/br_if.c   |  2 +-
 net/bridge/br_netlink.c  |  3 +++
 net/bridge/br_sysfs_if.c |  2 ++
 6 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index c5847dc..0c16866 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -48,6 +48,7 @@ struct br_ip_list {
 #define BR_MCAST_FLOOD BIT(11)
 #define BR_MULTICAST_TO_UNICASTBIT(12)
 #define BR_VLAN_TUNNEL BIT(13)
+#define BR_BCAST_FLOOD BIT(14)
 
 #define BR_DEFAULT_AGEING_TIME (300 * HZ)
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 633aa02..8e56ac7 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -323,6 +323,7 @@ enum {
IFLA_BRPORT_MCAST_FLOOD,
IFLA_BRPORT_MCAST_TO_UCAST,
IFLA_BRPORT_VLAN_TUNNEL,
+   IFLA_BRPORT_BCAST_FLOOD,
__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 902af6b..48fb174 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -183,13 +183,23 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
struct net_bridge_port *p;
 
list_for_each_entry_rcu(p, &br->port_list, list) {
-   /* Do not flood unicast traffic to ports that turn it off */
-   if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
-   continue;
-   /* Do not flood if mc off, except for traffic we originate */
-   if (pkt_type == BR_PKT_MULTICAST &&
-   !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
-   continue;
+   /* Do not flood unicast traffic to ports that turn it off, nor
+* other traffic if flood off, except for traffic we originate
+*/
+   switch (pkt_type) {
+   case BR_PKT_UNICAST:
+   if (!(p->flags & BR_FLOOD))
+   continue;
+   break;
+   case BR_PKT_MULTICAST:
+   if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
+   continue;
+   break;
+   case BR_PKT_BROADCAST:
+   if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
+   continue;
+   break;
+   }
 
/* Do not flood to ports that enable proxy ARP */
if (p->flags & BR_PROXYARP)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 6d273ca..b436ea0 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge 
*br,
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
-   p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD;
+   p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 6509864..a572db71 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -189,6 +189,8 @@ static int br_port_fill_attrs(struct sk_buff *skb,
   !!(p->flags & BR_FLOOD)) ||
nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD,
   !!(p->flags & BR_MCAST_FLOOD)) ||
+   nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD,
+  !!(p->flags & BR_BCAST_FLOOD)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) ||
nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI,
   !!(p->flags & BR_PROXYARP_WIFI)) ||
@@ -683,6 +685,7 @@ static int br_setport(struct net_bridge_port *p, struct 
nlattr *tb[])
br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
br

Re: [PATCH] net: bridge: suppress broadcast when multicast flood is disabled

2017-04-25 Thread Mike Manning

On 24/04/17 20:52, Nikolay Aleksandrov wrote:
> On 24/04/17 17:09, Mike Manning wrote:
>> Flood suppression for packets that are not unicast needs to be handled
>> consistently by also not flooding broadcast packets. As broadcast is a
>> special case of multicast, the same kernel parameter should be used to
>> suppress flooding for both of these packet types.
>>
>> Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag")
>> Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> ---
>>  net/bridge/br_forward.c | 17 ++---
>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>
> 
> I do not agree that this is a bug fix, the behaviour was intentional and is 
> close to how HW
> handles this flag. It has been like that for a few releases and changing it 
> may impact setups
> that use the flag since up until now they've seen the broadcast but not 
> multicast packets and
> suddenly their broadcast will stop.
> 
> I think it would be better to introduce a third flag for bcast in net-next 
> and use that to
> filter it since that would give us the ability to program HW that can 
> distinguish these
> and have both options available, moreover it will not break any user setups 
> relying on
> the current flag behaviour and we have such setups.
> 
> Thanks,
>  Nik
> 
> 

Hi Nik,
What is the use case for flooding broadcast but not multicast please? Is the
lack of flood suppression for broadcast just something that has not been
explicitly tested for in those setups? This is the case for us, the bug was
raised only at this stage of the release cycle. While adding another kernel
param is an option, I would only do so if absolutely necessary so as to avoid
proliferation of params. Also to justify adding such a flag for broadcast
suppression, I would need to add a comment to explain that while broadcast is
a subset of multicast, the multicast flood suppression flag excludes broadcast.

Thanks
Mike

[PATCH] net: bridge: suppress broadcast when multicast flood is disabled

2017-04-24 Thread Mike Manning

Flood suppression for packets that are not unicast needs to be handled
consistently by also not flooding broadcast packets. As broadcast is a
special case of multicast, the same kernel parameter should be used to
suppress flooding for both of these packet types.

Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag")
Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/bridge/br_forward.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 902af6b..a61c7ad 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -183,13 +183,16 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
struct net_bridge_port *p;
 
list_for_each_entry_rcu(p, &br->port_list, list) {
-   /* Do not flood unicast traffic to ports that turn it off */
-   if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
-   continue;
-   /* Do not flood if mc off, except for traffic we originate */
-   if (pkt_type == BR_PKT_MULTICAST &&
-   !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
-   continue;
+   /* Do not flood unicast traffic to ports that turn it off, nor
+* other traffic if mc flood off except for traffic we originate
+*/
+   if (pkt_type == BR_PKT_UNICAST) {
+   if (!(p->flags & BR_FLOOD))
+   continue;
+   } else {
+   if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
+   continue;
+   }
 
/* Do not flood to ports that enable proxy ARP */
if (p->flags & BR_PROXYARP)
-- 
2.1.4

Re: [PATCH] net: bridge: allow IPv6 when multicast flood is disabled

2017-03-01 Thread Mike Manning

On 28/02/17 09:20, Nikolay Aleksandrov wrote:
> We are aware of this and have discussed it, but I’m not sure this is the best 
> way to fix it,
> it will still allow local IPv4 mcast to be flooded on all ports even with 
> that flag removed and
> that definitely changes user-visible behaviour (even if it is okay) and will 
> not be appropriate
> for -net.
> 
> Let me get back to you on this one.
> 
> Thanks,
>  Nik
> 
Thanks for your comments, I have sent a v2 patch accordingly in case you have
no better suggestion. We need per-port disabling of multicast flooding, but
have to apply this patch to allow IPv6 connectivity so as to make it usable.
There is no noteworthy impact on IPv4 as the fix only allows packets
originated by the device. As this feature is new to the 4.9 kernel, there are
no backwards compatibility issues with prior kernel versions if this fix is
also applied to the 4.9 kernel.

[PATCH v2] net: bridge: allow IPv6 when multicast flood is disabled

2017-03-01 Thread Mike Manning

Even with multicast flooding turned off, IPv6 ND should still work so
that IPv6 connectivity is provided. Allow this by continuing to flood
multicast traffic originated by us.

Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag")
Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/bridge/br_forward.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 6bfac29..902af6b 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
/* Do not flood unicast traffic to ports that turn it off */
if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
continue;
+   /* Do not flood if mc off, except for traffic we originate */
if (pkt_type == BR_PKT_MULTICAST &&
-   !(p->flags & BR_MCAST_FLOOD))
+   !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
continue;
 
/* Do not flood to ports that enable proxy ARP */
-- 
2.1.4

[PATCH] net: bridge: allow IPv6 when multicast flood is disabled

2017-02-27 Thread Mike Manning

Even with multicast flooding turned off, IPv6 ND should still work so
that IPv6 connectivity is provided. Allow this by continuing to flood
multicast traffic originated by us. And similar to the unicast case,
set auto-mask if the multicast flood flag is set.

Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag")
Cc: Nikolay Aleksandrov <niko...@cumulusnetworks.com>
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 include/linux/if_bridge.h | 2 +-
 net/bridge/br_forward.c   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index c5847dc..7731808 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -40,12 +40,12 @@ struct br_ip_list {
 #define BR_ADMIN_COST  BIT(4)
 #define BR_LEARNINGBIT(5)
 #define BR_FLOOD   BIT(6)
-#define BR_AUTO_MASK   (BR_FLOOD | BR_LEARNING)
 #define BR_PROMISC BIT(7)
 #define BR_PROXYARPBIT(8)
 #define BR_LEARNING_SYNC   BIT(9)
 #define BR_PROXYARP_WIFI   BIT(10)
 #define BR_MCAST_FLOOD BIT(11)
+#define BR_AUTO_MASK   (BR_FLOOD | BR_LEARNING | BR_MCAST_FLOOD)
 #define BR_MULTICAST_TO_UNICASTBIT(12)
 #define BR_VLAN_TUNNEL BIT(13)
 
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 6bfac29..7fe7d58 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
/* Do not flood unicast traffic to ports that turn it off */
if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
continue;
+   /* Do not flood if mc off, except for traffic we originate */
if (pkt_type == BR_PKT_MULTICAST &&
-   !(p->flags & BR_MCAST_FLOOD))
+   !(p->flags & BR_MCAST_FLOOD) && (skb->dev != br->dev))
continue;
 
/* Do not flood to ports that enable proxy ARP */
-- 
2.1.4

Re: [PATCH] net: ipv6: avoid errors due to per-cpu atomic alloc

2016-11-22 Thread Mike Manning

On 11/22/2016 12:18 PM, Hannes Frederic Sowa wrote:
> On 22.11.2016 11:34, Mike Manning wrote:
>> Bursts of failures may occur when adding IPv6 routes via Netlink to the
>> kernel when testing under scale (e.g. 500 routes lost out of 1M). The
>> reason is that percpu.c:pcpu_balance_workfn() is not guaranteed to have
>> extended the area map in time for the atomic allocation using percpu.c:
>> pcpu_alloc() to succeed. This results in route additions failing with
>> an -ENOMEM error.
>>
>> While the sender of the Netlink msg to add this route could check for
>> an ACK and retransmit in the case of an -ENOMEM error, the latter
>> should not occur in the first place if there is plenty of memory. The
>> solution is to use non-atomic alloc for rt6_info instead. While the
>> client may now be blocked for longer depending on the state of the
>> chunk being added to, this work has to be incurred at some point.
>>
>> The alternative solution would be to provide configurable parameters
>> e.g. via sysctl in percpu.c for default map size, low/high empty pages
>> and map margins. For this solution, the map margin sizes need to be
>> stored per chunk, as large margins cannot be used if the dynamic early
>> slots map size is in use. This is not a preferred solution though, as
>> it requires tuning of these parameters to provide sufficient margins to
>> avoid -ENOMEM errors depending on system requirements.
>>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> ---
>>  net/ipv6/route.c |2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 1b57e11..0e9bb76 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -347,7 +347,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
>>  struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
>>  
>>  if (rt) {
>> -rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
>> +rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
>>  if (rt->rt6i_pcpu) {
>>  int cpu;
> 
> Nak, this doesn't work, as ip6_dst_alloc must be callable from
> non-blocking code paths unfortunately.
> 
> 

Thanks for the prompt reply.

Do you consider the alternative of providing configurable parameters for per-cpu
alloc as viable, or is there a better way of dealing with this?

While I have tested that such param changes avoid the -ENOMEM errors under
scale, it would be good to get confirmation that this approach is acceptable
prior to coding the sysctl handling for these.

[PATCH] net: ipv6: avoid errors due to per-cpu atomic alloc

2016-11-22 Thread Mike Manning

Bursts of failures may occur when adding IPv6 routes via Netlink to the
kernel when testing under scale (e.g. 500 routes lost out of 1M). The
reason is that percpu.c:pcpu_balance_workfn() is not guaranteed to have
extended the area map in time for the atomic allocation using percpu.c:
pcpu_alloc() to succeed. This results in route additions failing with
an -ENOMEM error.

While the sender of the Netlink msg to add this route could check for
an ACK and retransmit in the case of an -ENOMEM error, the latter
should not occur in the first place if there is plenty of memory. The
solution is to use non-atomic alloc for rt6_info instead. While the
client may now be blocked for longer depending on the state of the
chunk being added to, this work has to be incurred at some point.

The alternative solution would be to provide configurable parameters
e.g. via sysctl in percpu.c for default map size, low/high empty pages
and map margins. For this solution, the map margin sizes need to be
stored per chunk, as large margins cannot be used if the dynamic early
slots map size is in use. This is not a preferred solution though, as
it requires tuning of these parameters to provide sufficient margins to
avoid -ENOMEM errors depending on system requirements.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/route.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1b57e11..0e9bb76 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -347,7 +347,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net,
struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
 
if (rt) {
-   rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
+   rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL);
if (rt->rt6i_pcpu) {
int cpu;
 
-- 
1.7.10.4

Re: [PATCH] net: ipv6: Disable forwarding per interface via sysctl

2016-09-16 Thread Mike Manning

On 09/16/2016 04:46 PM, Hannes Frederic Sowa wrote:
> On 16.09.2016 15:39, Eric Dumazet wrote:
>> On Fri, 2016-09-16 at 13:47 +0100, Mike Manning wrote:
>>> Disabling forwarding per interface via sysctl continues to allow
>>> forwarding. This is contrary to the sysctl documentation stating that
>>> the forwarding sysctl is per interface, whereas currently it is only
>>> the sysctl for all interfaces that has an effect on forwarding. The
>>> solution is to drop any received packets instead of forwarding them
>>> if the ingress device has a per-device forwarding sysctl that is unset.
>>
>> Some archaeological research might be needed because
>> Documentation/networking/ip-sysctl.txt states :
>>
>> IPv4 and IPv6 work differently here; e.g. netfilter must be used
>> to control which interfaces may forward packets and which not.
>>
>> If this netfilter requirement is obsolete, then your patch would need to
>> change the doc as well.
>>
>> Hannes can probably comment on this ?
> 
> Yep, thanks.
> 
> This commit breaks a very common setup: people globally enabled
> forwarding but disabled the forwarding knob on one special interface to
> allow this interface to participate in auto configuration from their
> provider while still forwarding packets over this interface.
> 
> I fear this is so common that this would be a uapi violation.
> 
> Thanks,
> Hannes
> 
> 
Thanks for the use-case; I request to withdraw this patch then.
So configuring an interface on a router to be in host mode does not actually
disable forwarding in the kernel; it merely allows SLAAC. Using ip6tables to
disable forwarding on an interface that one wants in host mode seems a
heavyweight workaround. If anyone has any better suggestions, please let me
know.

[PATCH] net: ipv6: Disable forwarding per interface via sysctl

2016-09-16 Thread Mike Manning

Disabling forwarding per interface via sysctl continues to allow
forwarding. This is contrary to the sysctl documentation stating that
the forwarding sysctl is per interface, whereas currently it is only
the sysctl for all interfaces that has an effect on forwarding. The
solution is to drop any received packets instead of forwarding them
if the ingress device has a per-device forwarding sysctl that is unset.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/ip6_output.c |4 
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1dfc402..37cd1d0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -380,11 +380,15 @@ int ip6_forward(struct sk_buff *skb)
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
struct net *net = dev_net(dst->dev);
+   struct inet6_dev *idev = __in6_dev_get(skb->dev);
u32 mtu;
 
if (net->ipv6.devconf_all->forwarding == 0)
goto error;
 
+   if (idev && !idev->cnf.forwarding)
+   goto error;
+
if (skb->pkt_type != PACKET_HOST)
goto drop;
 
-- 
1.7.10.4

[PATCH] net: ipv6: Failure to disable forwarding per interface via sysctl

2016-09-16 Thread Mike Manning

Disabling forwarding per interface via sysctl continues to allow
forwarding. This is contrary to the sysctl documentation stating that
the forwarding sysctl is per interface, whereas currently it is only
the sysctl for all interfaces that has an effect on forwarding. The
solution is to drop any received packets instead of forwarding them
if the ingress device has a per-device forwarding sysctl that is unset.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/ip6_output.c |4 
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1dfc402..37cd1d0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -380,11 +380,15 @@ int ip6_forward(struct sk_buff *skb)
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
struct net *net = dev_net(dst->dev);
+   struct inet6_dev *idev = __in6_dev_get(skb->dev);
u32 mtu;
 
if (net->ipv6.devconf_all->forwarding == 0)
goto error;
 
+   if (idev && !idev->cnf.forwarding)
+   goto error;
+
if (skb->pkt_type != PACKET_HOST)
goto drop;
 
-- 
1.7.10.4

[PATCH v2] net: ipv6: Remove addresses for failures with strict DAD

2016-08-18 Thread Mike Manning

If DAD fails with accept_dad set to 2, global addresses and host routes
are incorrectly left in place. Even though disable_ipv6 is set,
contrary to documentation, the addresses are not dynamically deleted
from the interface. It is only on a subsequent link down/up that these
are removed. The fix is not only to set the disable_ipv6 flag, but
also to call addrconf_ifdown(), which is the action to carry out when
disabling IPv6. This results in the addresses and routes being deleted
immediately. The DAD failure for the LL addr is determined as before
via netlink, or by the absence of the LL addr (which also previously
would have had to be checked for in case of an intervening link down
and up). As the call to addrconf_ifdown() requires an rtnl lock, the
logic to disable IPv6 when DAD fails is moved to addrconf_dad_work().

Previous behavior:

root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2
net.ipv6.conf.eth3.accept_dad = 2
root@vm1:/# ip -6 addr add 2000::10/64 dev eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
5: eth3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000
inet6 2000::10/64 scope global
   valid_lft forever preferred_lft forever
inet6 fe80::5054:ff:fe43:dd5a/64 scope link tentative dadfailed
   valid_lft forever preferred_lft forever
root@vm1:/# ip -6 route show dev eth3
2000::/64  proto kernel  metric 256
fe80::/64  proto kernel  metric 256
root@vm1:/# ip link set down eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
root@vm1:/# ip -6 route show dev eth3
root@vm1:/#

New behavior:

root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2
net.ipv6.conf.eth3.accept_dad = 2
root@vm1:/# ip -6 addr add 2000::10/64 dev eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
root@vm1:/# ip -6 route show dev eth3
root@vm1:/#

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c |   34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index df8425f..f418d2e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1872,7 +1872,6 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
 
 void addrconf_dad_failure(struct inet6_ifaddr *ifp)
 {
-   struct in6_addr addr;
struct inet6_dev *idev = ifp->idev;
struct net *net = dev_net(ifp->idev->dev);
 
@@ -1934,18 +1933,6 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
in6_ifa_put(ifp2);
 lock_errdad:
spin_lock_bh(>lock);
-   } else if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
-   addr.s6_addr32[0] = htonl(0xfe80);
-   addr.s6_addr32[1] = 0;
-
-   if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
-   ipv6_addr_equal(>addr, )) {
-   /* DAD failed for link-local based on MAC address */
-   idev->cnf.disable_ipv6 = 1;
-
-   pr_info("%s: IPv6 being disabled!\n",
-   ifp->idev->dev->name);
-   }
}
 
 errdad:
@@ -3821,6 +3808,7 @@ static void addrconf_dad_work(struct work_struct *w)
dad_work);
struct inet6_dev *idev = ifp->idev;
struct in6_addr mcaddr;
+   bool disable_ipv6 = false;
 
enum {
DAD_PROCESS,
@@ -3837,6 +3825,24 @@ static void addrconf_dad_work(struct work_struct *w)
} else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
action = DAD_ABORT;
ifp->state = INET6_IFADDR_STATE_POSTDAD;
+
+   if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6 &&
+   !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
+   struct in6_addr addr;
+
+   addr.s6_addr32[0] = htonl(0xfe80);
+   addr.s6_addr32[1] = 0;
+
+   if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
+   ipv6_addr_equal(>addr, )) {
+   /* DAD failed for link-local based on MAC */
+   idev->cnf.disable_ipv6 = 1;
+
+   pr_info("%s: IPv6 being disabled!\n",
+   ifp->idev->dev->name);
+   disable_ipv6 = true;
+   }
+   }
}
spin_unlock_bh(>lock);
 
@@ -3845,6 +3851,8 @@ static void addrconf_dad_work(struct work_struct *w)
goto out;
} else if (action == DAD_ABORT) {
addrconf_dad_stop(ifp, 1);
+   if (disable_ipv6)
+   addrconf_ifdown(idev->dev, 0);
goto out;
}
 
-- 
1.7.10.4

Re: [PATCH] net: ipv6: Remove addresses for failures with strict DAD

2016-08-17 Thread Mike Manning

On 08/17/2016 04:40 PM, Hannes Frederic Sowa wrote:
> On 17.08.2016 12:28, Mike Manning wrote:
>> +static void dev_disable_change(struct inet6_dev *idev);
>>  
>>  /*
>>   *  Configured unicast address hash table
>> @@ -1945,6 +1946,12 @@ lock_errdad:
>>  
>>  pr_info("%s: IPv6 being disabled!\n",
>>  ifp->idev->dev->name);
>> +spin_unlock_bh(>lock);
>> +addrconf_dad_stop(ifp, 1);
>> +rtnl_lock();
>> +dev_disable_change(idev);
>> +rtnl_unlock();
>> +return;
>>  }
>>  }
> 
> You can't take rtnl_lock at that point but must postpone the actions and
> do that in addrconf_dad_work.
> 
> Probably the whole ... else if (idev->cnf.accept_dad > 1 && ...) needs
> to move there.
> 
> Bye,
> Hannes
> 
> 

Thanks for the prompt review, I will look into making these changes.

Also, these changes caused a build error when compiled without CONFIG_SYSCTL,
which is resolved by replacing the call to dev_disable_change(idev) with a
direct call to addrconf_ifdown(idev->dev, 0).

I would appreciate any further comments if the suggested change in behavior
is not acceptable.

Thanks
Mike

[PATCH] net: ipv6: Remove addresses for failures with strict DAD

2016-08-17 Thread Mike Manning

If DAD fails with accept_dad set to 2, global addresses and host routes
are incorrectly left in place. Even though disable_ipv6 is set,
contrary to documentation, the addresses are not dynamically deleted
from the interface. It is only on a subsequent link down/up that these
are removed. The fix is not only to set the disable_ipv6 flag, but to
actually disable IPv6 when DAD fails in this case. This results in the
addresses and routes being immediately deleted. The DAD failure
for the LL addr is determined as before via netlink, or by the absence
of the LL addr (which also previously would have had to be checked for
in case of an intervening link down/up).

Previous behavior:

root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2
net.ipv6.conf.eth3.accept_dad = 2
root@vm1:/# ip -6 addr add 2000::10/64 dev eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
5: eth3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qlen 1000
inet6 2000::10/64 scope global
   valid_lft forever preferred_lft forever
inet6 fe80::5054:ff:fe43:dd5a/64 scope link tentative dadfailed
   valid_lft forever preferred_lft forever
root@vm1:/# ip -6 route show dev eth3
2000::/64  proto kernel  metric 256
fe80::/64  proto kernel  metric 256
root@vm1:/# ip link set down eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
root@vm1:/# ip -6 route show dev eth3
root@vm1:/#

New behavior:

root@vm1:/# sysctl net.ipv6.conf.eth3.accept_dad=2
net.ipv6.conf.eth3.accept_dad = 2
root@vm1:/# ip -6 addr add 2000::10/64 dev eth3
root@vm1:/# ip link set up eth3
root@vm1:/# ip -6 addr show dev eth3
root@vm1:/# ip -6 route show dev eth3
root@vm1:/#

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index df8425f..6be5a95 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -135,6 +135,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev);
 static int ipv6_generate_stable_address(struct in6_addr *addr,
u8 dad_count,
const struct inet6_dev *idev);
+static void dev_disable_change(struct inet6_dev *idev);
 
 /*
  * Configured unicast address hash table
@@ -1945,6 +1946,12 @@ lock_errdad:
 
pr_info("%s: IPv6 being disabled!\n",
ifp->idev->dev->name);
+   spin_unlock_bh(>lock);
+   addrconf_dad_stop(ifp, 1);
+   rtnl_lock();
+   dev_disable_change(idev);
+   rtnl_unlock();
+   return;
}
}
 
-- 
1.7.10.4

[PATCH] net: ipv6: Do not keep IPv6 addresses when IPv6 is disabled

2016-08-12 Thread Mike Manning

If IPv6 is disabled when the option is set to keep IPv6
addresses on link down, userspace is unaware of this as
there is no such indication via netlink. The solution is to
remove the IPv6 addresses in this case, which results in
netlink messages indicating removal of addresses in the
usual manner. This fix also makes the behavior consistent
with the case of having IPv6 disabled first, which stops
IPv6 addresses from being added.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ab3e796..df8425f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3543,7 +3543,7 @@ static int addrconf_ifdown(struct net_device *dev, int 
how)
/* combine the user config with event to determine if permanent
 * addresses are to be removed from address hash table
 */
-   keep_addr = !(how || _keep_addr <= 0);
+   keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6);
 
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
@@ -3599,7 +3599,7 @@ restart:
/* re-combine the user config with event to determine if permanent
 * addresses are to be removed from the interface list
 */
-   keep_addr = (!how && _keep_addr > 0);
+   keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6);
 
INIT_LIST_HEAD(_list);
list_for_each_entry_safe(ifa, tmp, >addr_list, if_list) {
-- 
1.7.10.4

[PATCH v2] net: ipv6: Always leave anycast and multicast groups on link down

2016-07-22 Thread Mike Manning

Default kernel behavior is to delete IPv6 addresses on link
down, which entails deletion of the multicast and the
subnet-router anycast addresses. These deletions do not
happen with sysctl setting to keep global IPv6 addresses on
link down, so every link down/up causes an increment of the
anycast and multicast refcounts. These bogus refcounts may
stop these addrs from being removed on subsequent calls to
delete them. The solution is to leave the groups for the
multicast and subnet anycast on link down for the callflow
when global IPv6 addresses are kept.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c |4 
 1 file changed, 4 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47f837a..047c75a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3562,6 +3562,10 @@ restart:
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
+   } else {
+   if (idev->cnf.forwarding)
+   addrconf_leave_anycast(ifa);
+   addrconf_leave_solict(ifa->idev, >addr);
}
 
write_lock_bh(>lock);
-- 
1.7.10.4

[PATCH] net: ipv6: Always leave subnet anycast group on link down

2016-07-21 Thread Mike Manning


Default kernel behavior is to delete IPv6 addresses on link
down, which entails deletion of the address-derived
subnet-router anycast address. The latter does not happen
with sysctl setting to keep global IPv6 addrs on link down,
so every link down/up causes an increment of the anycast
refcount, cf aca_users in __ipv6_dev_ac_inc(). This bogus
refcount stops the anycast being removed on subsequent
calls to delete the address. The solution is to leave the
group for this subnet anycast on link down also for the
callflow when global IPv6 addresses are kept.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/ipv6/addrconf.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47f837a..3c69e56 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3562,6 +3562,8 @@ restart:
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
+   } else if (idev->cnf.forwarding) {
+   addrconf_leave_anycast(ifa);
}
 
write_lock_bh(>lock);
-- 
1.7.10.4

[PATCH net v5] vlan: Propagate MAC address to VLANs

2016-05-27 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created, resulting in an inconsistency after MAC
address changes of only newly created VLANs having an up-to-date MAC.

The VLANs should continue inheriting the MAC address of the physical
interface until the VLAN MAC address is explicitly set to any value. 
This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
to the MAC of the physical interface and thus for DAD to behave as
expected.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/8021q/vlan.c |5 +
 net/8021q/vlan.h |2 ++
 net/8021q/vlan_dev.c |   20 +---
 3 files changed, 24 insertions(+), 3 deletions(-)

--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -291,6 +291,10 @@ static void vlan_sync_address(struct net
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
 
+   /* vlan continues to inherit address of lower device */
+   if (vlan_dev_inherit_address(vlandev, dev))
+   goto out;
+
/* vlan address was different from the old address and is equal to
 * the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -303,6 +307,7 @@ static void vlan_sync_address(struct net
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
 
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_devic
 void vlan_setup(struct net_device *dev);
 int register_vlan_dev(struct net_device *dev);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
u16 vlan_tci)
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -244,6 +244,17 @@ void vlan_dev_get_realdev_name(const str
strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
 }
 
+bool vlan_dev_inherit_address(struct net_device *dev,
+ struct net_device *real_dev)
+{
+   if (dev->addr_assign_type != NET_ADDR_STOLEN)
+   return false;
+
+   ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+   call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+   return true;
+}
+
 static int vlan_dev_open(struct net_device *dev)
 {
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -254,7 +265,8 @@ static int vlan_dev_open(struct net_devi
!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
return -ENETDOWN;
 
-   if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
+   if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) &&
+   !vlan_dev_inherit_address(dev, real_dev)) {
err = dev_uc_add(real_dev, dev->dev_addr);
if (err < 0)
goto out;
@@ -558,8 +570,10 @@ static int vlan_dev_init(struct net_devi
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
 
-   if (is_zero_ether_addr(dev->dev_addr))
-   eth_hw_addr_inherit(dev, real_dev);
+   if (is_zero_ether_addr(dev->dev_addr)) {
+   ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+   dev->addr_assign_type = NET_ADDR_STOLEN;
+   }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
 
-- 
1.7.10.4

Re: [PATCH net v4] vlan: Propagate MAC address to VLANs

2016-05-09 Thread Mike Manning

On 05/09/2016 07:48 AM, Michal Kubecek wrote:
> On Sat, May 07, 2016 at 11:00:09AM +0100, Mike Manning wrote:
>> The MAC address of the physical interface is only copied to the VLAN
>> when it is first created, resulting in an inconsistency after MAC
>> address changes of only newly created VLANs having an up-to-date MAC.
>>
>> The VLANs should continue inheriting the MAC address of the physical
>> interface until the VLAN MAC address is explicitly set to any value. 
>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
>> to the MAC of the physical interface and thus for DAD to behave as
>> expected.
>>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> ---
>>  net/8021q/vlan.c |7 +++
>>  net/8021q/vlan_dev.c |   14 ++
>>  2 files changed, 17 insertions(+), 4 deletions(-)
>>
>> --- a/net/8021q/vlan.c
>> +++ b/net/8021q/vlan.c
>> @@ -291,6 +291,12 @@ static void vlan_sync_address(struct net
>>  if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
>>  return;
>>  
>> +/* vlan continues to inherit address of parent interface */
>> +if (vlandev->addr_assign_type == NET_ADDR_STOLEN) {
>> +ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
>> +goto out;
>> +}
>> +
> 
> I might have missed something in the previous discussion but as
> ether_addr_copy() is just an optimized memcpy(), how is this going to
> handle the setups where the vlan device itself has an upper device? For
> example,
> 
>   - if it is a bridge port, how is the bridge going to learn about its
> address change so that it can update its FDB?
>   - if it is a bond slave or team port, current code preserves the vlan
> device address on real device change so everything is fine; your
> proposal would change vlan device's address without bond being even
> notified, I believe
>   - there might be a macvlan on top of the vlan and you might
> accidentally match its address with the new one
>

Thanks for your review and this excellent catch. I will add address
notification for the vlan itself and test appropriately for when an upper
device is present.

>>  /* vlan address was different from the old address and is equal to
>>   * the new address */
>>  if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
>> @@ -303,6 +309,7 @@ static void vlan_sync_address(struct net
>>  !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
>>  dev_uc_add(dev, vlandev->dev_addr);
>>  
>> +out:
>>  ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
>>  }
>>  
>> --- a/net/8021q/vlan_dev.c
>> +++ b/net/8021q/vlan_dev.c
>> @@ -255,9 +255,13 @@ static int vlan_dev_open(struct net_devi
>>  return -ENETDOWN;
>>  
>>  if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
>> -err = dev_uc_add(real_dev, dev->dev_addr);
>> -if (err < 0)
>> -goto out;
>> +if (dev->addr_assign_type == NET_ADDR_STOLEN) {
>> +ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
> 
> The same question here.
> 
>> +} else {
>> +err = dev_uc_add(real_dev, dev->dev_addr);
>> +if (err < 0)
>> +goto out;
>> +}
>>  }
>>  
>>  if (dev->flags & IFF_ALLMULTI) {
>> @@ -558,8 +562,10 @@ static int vlan_dev_init(struct net_devi
>>  /* ipv6 shared card related stuff */
>>  dev->dev_id = real_dev->dev_id;
>>  
>> -if (is_zero_ether_addr(dev->dev_addr))
>> +if (is_zero_ether_addr(dev->dev_addr)) {
>>  eth_hw_addr_inherit(dev, real_dev);
>> +dev->addr_assign_type = NET_ADDR_STOLEN;
> 
> You might want to replace eth_hw_addr_inherit() with ether_addr_copy()
> here as they only differ in the former copying addr_assign_type which
> you are going to rewrite anyway. (But as both are most likely inlined,
> I would expect the resulting code to be the same in the end.)
> 
>Michal Kubecek
>

Thanks. Yes, I was aware of this but decided not to change it to keep the
changeset to a minimum. I will make the change as recommended.
 
>> +}
>>  if (is_zero_ether_addr(dev->broadcast))
>>  memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);
>>
>> -- 
>> 1.7.10.4
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>

[PATCH net v4] vlan: Propagate MAC address to VLANs

2016-05-07 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created, resulting in an inconsistency after MAC
address changes of only newly created VLANs having an up-to-date MAC.

The VLANs should continue inheriting the MAC address of the physical
interface until the VLAN MAC address is explicitly set to any value. 
This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
to the MAC of the physical interface and thus for DAD to behave as
expected.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/8021q/vlan.c |7 +++
 net/8021q/vlan_dev.c |   14 ++
 2 files changed, 17 insertions(+), 4 deletions(-)

--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -291,6 +291,12 @@ static void vlan_sync_address(struct net
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
 
+   /* vlan continues to inherit address of parent interface */
+   if (vlandev->addr_assign_type == NET_ADDR_STOLEN) {
+   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
+   goto out;
+   }
+
/* vlan address was different from the old address and is equal to
 * the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -303,6 +309,7 @@ static void vlan_sync_address(struct net
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
 
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -255,9 +255,13 @@ static int vlan_dev_open(struct net_devi
return -ENETDOWN;
 
if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) {
-   err = dev_uc_add(real_dev, dev->dev_addr);
-   if (err < 0)
-   goto out;
+   if (dev->addr_assign_type == NET_ADDR_STOLEN) {
+   ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+   } else {
+   err = dev_uc_add(real_dev, dev->dev_addr);
+   if (err < 0)
+   goto out;
+   }
}
 
if (dev->flags & IFF_ALLMULTI) {
@@ -558,8 +562,10 @@ static int vlan_dev_init(struct net_devi
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
 
-   if (is_zero_ether_addr(dev->dev_addr))
+   if (is_zero_ether_addr(dev->dev_addr)) {
eth_hw_addr_inherit(dev, real_dev);
+   dev->addr_assign_type = NET_ADDR_STOLEN;
+   }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);

-- 
1.7.10.4

Re: [PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-07 Thread Mike Manning

On 05/06/2016 08:48 PM, Alexander Duyck wrote:
> On Fri, May 6, 2016 at 12:36 PM, Mike Manning <mmann...@brocade.com> wrote:
>> On 05/06/2016 06:02 PM, Alexander Duyck wrote:
>>> On Fri, May 6, 2016 at 6:26 AM, Mike Manning <mmann...@brocade.com> wrote:
>>>> The MAC address of the physical interface is only copied to the VLAN
>>>> when it is first created, resulting in an inconsistency after MAC
>>>> address changes of only newly created VLANs having an up-to-date MAC.
>>>>
>>>> The VLANs should continue inheriting the MAC address of the physical
>>>> interface, unless explicitly changed to be different from this.
>>>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
>>>> to the MAC of the physical interface and thus for DAD to behave as
>>>> expected.
>>>>
>>>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>>>> ---
>>>>  include/linux/if_vlan.h |2 ++
>>>>  net/8021q/vlan.c|   17 +++--
>>>>  net/8021q/vlan_dev.c|   13 ++---
>>>>  3 files changed, 23 insertions(+), 9 deletions(-)
>>>>
>>>> --- a/include/linux/if_vlan.h
>>>> +++ b/include/linux/if_vlan.h
>>>> @@ -138,6 +138,7 @@ struct netpoll;
>>>>   * @flags: device flags
>>>>   * @real_dev: underlying netdevice
>>>>   * @real_dev_addr: address of underlying netdevice
>>>> + * @addr_assign_type: address assignment type
>>>>   * @dent: proc dir entry
>>>>   * @vlan_pcpu_stats: ptr to percpu rx stats
>>>>   */
>>>> @@ -153,6 +154,7 @@ struct vlan_dev_priv {
>>>>
>>>> struct net_device   *real_dev;
>>>> unsigned char   real_dev_addr[ETH_ALEN];
>>>> +   unsigned char   addr_assign_type;
>>>>
>>>> struct proc_dir_entry   *dent;
>>>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
>>>
>>> Please don't start adding new members to structures when it already
>>> exists in the net_device.  If anything you should be able to drop
>>> real_dev_addr if you do this correctly because you shouldn't need to
>>> clone the lower dev address to watch for changes.  All you will need
>>> to do is watch NET_ADDR_STOLEN.
>>>
>>
>> Thanks for the detailed review. I had initially used the existing type
>> in net_device, but the problem with this was that it got overwritten to
>> NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify.
>> It would just be a case of setting the type earlier in that function
>> (and caching the previous value in case there is an error).
>>
>> However, based on your later comment, it seems I should not bother with
>> the approach I have here, namely that if the VLAN MAC is set to the same
>> value as that of the lower device MAC, that is to be considered as
>> resetting it and thus for MAC inheritance to resume. Instead, I will just
>> make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited,
>> and if it is set to anything (even the value of the lower device MAC),
>> inheritance is stopped. I agree this makes for a far simpler changeset.
>>
>> I don't think I can remove real_dev_addr, as that is still needed for
>> the existing functionality in vlan_sync_address() to determine if the sync
>> should be done, also as a way of caching it for handling in vlan_dev_open().
> 
> The thing is that logic isn't really needed anymore though if you are
> going to be following the lower dev.  If you follow the code what it
> is doing is adding the address via dev_uc_add if the lower address
> moves away from the VLAN address.  With your changes you are updating
> the VLAN MAC address to the lower value in the NET_ADDR_STOLEN case so
> you don't need to add or remove an extra unicast address.  If the user
> sets the MAC address you can then use the vlandev->dev_addr as the
> address you add/remove from the unicast list and you probably don't
> need to bother with tracking the lower device state anyway.
>

I agree that this logic is not needed at all for the NET_ADDR_STOLEN case.
However, once the VLAN MAC has been explicitly set, the situation reverts
to the existing functionality, whereby real_dev_addr is used to ensure that
dev_uc_add/del are not incorrectly called multiple times for the same MACs.
As an example, if the lower device MAC is different from the V

Re: [PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Mike Manning

On 05/06/2016 06:02 PM, Alexander Duyck wrote:
> On Fri, May 6, 2016 at 6:26 AM, Mike Manning <mmann...@brocade.com> wrote:
>> The MAC address of the physical interface is only copied to the VLAN
>> when it is first created, resulting in an inconsistency after MAC
>> address changes of only newly created VLANs having an up-to-date MAC.
>>
>> The VLANs should continue inheriting the MAC address of the physical
>> interface, unless explicitly changed to be different from this.
>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
>> to the MAC of the physical interface and thus for DAD to behave as
>> expected.
>>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
>> ---
>>  include/linux/if_vlan.h |2 ++
>>  net/8021q/vlan.c|   17 +++--
>>  net/8021q/vlan_dev.c|   13 ++---
>>  3 files changed, 23 insertions(+), 9 deletions(-)
>>
>> --- a/include/linux/if_vlan.h
>> +++ b/include/linux/if_vlan.h
>> @@ -138,6 +138,7 @@ struct netpoll;
>>   * @flags: device flags
>>   * @real_dev: underlying netdevice
>>   * @real_dev_addr: address of underlying netdevice
>> + * @addr_assign_type: address assignment type
>>   * @dent: proc dir entry
>>   * @vlan_pcpu_stats: ptr to percpu rx stats
>>   */
>> @@ -153,6 +154,7 @@ struct vlan_dev_priv {
>>
>> struct net_device   *real_dev;
>> unsigned char   real_dev_addr[ETH_ALEN];
>> +   unsigned char   addr_assign_type;
>>
>> struct proc_dir_entry   *dent;
>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
> 
> Please don't start adding new members to structures when it already
> exists in the net_device.  If anything you should be able to drop
> real_dev_addr if you do this correctly because you shouldn't need to
> clone the lower dev address to watch for changes.  All you will need
> to do is watch NET_ADDR_STOLEN.
> 

Thanks for the detailed review. I had initially used the existing type
in net_device, but the problem with this was that it got overwritten to
NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify.
It would just be a case of setting the type earlier in that function
(and caching the previous value in case there is an error).

However, based on your later comment, it seems I should not bother with
the approach I have here, namely that if the VLAN MAC is set to the same
value as that of the lower device MAC, that is to be considered as
resetting it and thus for MAC inheritance to resume. Instead, I will just
make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited,
and if it is set to anything (even the value of the lower device MAC),
inheritance is stopped. I agree this makes for a far simpler changeset.

I don't think I can remove real_dev_addr, as that is still needed for
the existing functionality in vlan_sync_address() to determine if the sync
should be done, also as a way of caching it for handling in vlan_dev_open().

As a matter of interest, what is the advantage of not updating the VLAN
MAC when it is down? I appreciate that one should not add/delete
secondary unicast addresses in this case, but there is no such 
restriction for copying the MAC.

>> --- a/net/8021q/vlan.c
>> +++ b/net/8021q/vlan.c
>> @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net
>> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
>> return;
>>
>> +   /* vlan continues to inherit address of parent interface */
>> +   if (vlan->addr_assign_type == NET_ADDR_STOLEN) {
>> +   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
>> +   goto out;
>> +   }
>> +
>> +   if (!(vlandev->flags & IFF_UP))
>> +   goto out;
>> +
>> /* vlan address was different from the old address and is equal to
>>  * the new address */
>> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
>> @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net
>> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
>> dev_uc_add(dev, vlandev->dev_addr);
>>
>> +out:
>> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
>>  }
>>
>> @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti
>>
>> case NETDEV_CHANGEADDR:
>> /* Adjust unicast filters on underlying device */
>> -   vlan_group_for_each_dev(g

[PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created, resulting in an inconsistency after MAC
address changes of only newly created VLANs having an up-to-date MAC.

The VLANs should continue inheriting the MAC address of the physical
interface, unless explicitly changed to be different from this. 
This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
to the MAC of the physical interface and thus for DAD to behave as
expected.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 include/linux/if_vlan.h |2 ++
 net/8021q/vlan.c|   17 +++--
 net/8021q/vlan_dev.c|   13 ++---
 3 files changed, 23 insertions(+), 9 deletions(-)

--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -138,6 +138,7 @@ struct netpoll;
  * @flags: device flags
  * @real_dev: underlying netdevice
  * @real_dev_addr: address of underlying netdevice
+ * @addr_assign_type: address assignment type
  * @dent: proc dir entry
  * @vlan_pcpu_stats: ptr to percpu rx stats
  */
@@ -153,6 +154,7 @@ struct vlan_dev_priv {
 
struct net_device   *real_dev;
unsigned char   real_dev_addr[ETH_ALEN];
+   unsigned char   addr_assign_type;
 
struct proc_dir_entry   *dent;
struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -291,6 +291,15 @@ static void vlan_sync_address(struct net
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
 
+   /* vlan continues to inherit address of parent interface */
+   if (vlan->addr_assign_type == NET_ADDR_STOLEN) {
+   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
+   goto out;
+   }
+
+   if (!(vlandev->flags & IFF_UP))
+   goto out;
+
/* vlan address was different from the old address and is equal to
 * the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -303,6 +312,7 @@ static void vlan_sync_address(struct net
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
 
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
@@ -389,13 +399,8 @@ static int vlan_device_event(struct noti
 
case NETDEV_CHANGEADDR:
/* Adjust unicast filters on underlying device */
-   vlan_group_for_each_dev(grp, i, vlandev) {
-   flgs = vlandev->flags;
-   if (!(flgs & IFF_UP))
-   continue;
-
+   vlan_group_for_each_dev(grp, i, vlandev)
vlan_sync_address(dev, vlandev);
-   }
break;
 
case NETDEV_CHANGEMTU:
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi
 
 static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
 {
-   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+   struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+   struct net_device *real_dev = vlan->real_dev;
struct sockaddr *addr = p;
+   bool is_real_addr;
int err;
 
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
 
+   is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr);
+
if (!(dev->flags & IFF_UP))
goto out;
 
-   if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) {
+   if (!is_real_addr) {
err = dev_uc_add(real_dev, addr->sa_data);
if (err < 0)
return err;
@@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru
 
 out:
ether_addr_copy(dev->dev_addr, addr->sa_data);
+   vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : NET_ADDR_SET;
return 0;
 }
 
@@ -558,8 +563,10 @@ static int vlan_dev_init(struct net_devi
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
 
-   if (is_zero_ether_addr(dev->dev_addr))
+   if (is_zero_ether_addr(dev->dev_addr)) {
eth_hw_addr_inherit(dev, real_dev);
+   vlan_dev_priv(dev)->addr_assign_type = NET_ADDR_STOLEN;
+   }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);

-- 
1.7.10.4

Re: [PATCH net] vlan: Propagate MAC address changes properly

2016-05-03 Thread Mike Manning

On 05/03/2016 05:16 AM, David Miller wrote:
> From: Mike Manning <mmann...@brocade.com>
> Date: Sat, 30 Apr 2016 11:32:37 +0100
> 
>> The MAC address of the physical interface is only copied to the VLAN
>> when it is first created, resulting in an inconsistency after MAC
>> address changes of only newly created VLANs having an up-to-date MAC.
>>
>> Continuing to inherit the MAC address unless explicitly changed for
>> the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change
>> and thus for DAD to behave as expected for the given MAC.
>>
>> Signed-off-by: Mike Manning <mmann...@brocade.com>
> 
> What is this code really trying to achieve?
> 
> Is it "Propagate real device MAC changes to underlying vlan device,
> but not if the user set the vlan MAC explicitly."?

Right, I will update the subject header to make this clearer

> 
> If so, implement that instead of all of these confusing tests.
> 
> If the vlan device's set_mac_address operation is ever called,
> set a boolean value in the vlan device private to true and test
> it here.
> 

Given that this information is implicit in real_dev_addr, I am reluctant
to add another member to the vlan_dev_priv data structure, especially given
that there may be a large number of VLANs. Instead I have added a variable
real_addr_in_use in vlan_sync_address() to make this clearer.

[PATCH net v2] vlan: Propagate MAC address to VLANs unless explicitly set

2016-05-03 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created. After a MAC address change, this results in
an inconsistency: only newly created VLANs have an up-to-date MAC.

Continuing to inherit the MAC address unless explicitly changed for
the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change
and thus for DAD to behave as expected for the given MAC.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/8021q/vlan.c |   22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -286,22 +286,25 @@ static void vlan_sync_address(struct net
  struct net_device *vlandev)
 {
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
+   bool real_addr_in_use;
 
/* May be called without an actual change */
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
 
-   /* vlan address was different from the old address and is equal to
+   real_addr_in_use =
+   ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr);
+
+   /* vlan address was different from the real address and is equal to
 * the new address */
-   if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+   if ((vlandev->flags & IFF_UP) && !real_addr_in_use &&
ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_del(dev, vlandev->dev_addr);
 
-   /* vlan address was equal to the old address and is different from
+   /* vlan address was equal to the real address so now also inherit
 * the new address */
-   if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
-   !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
-   dev_uc_add(dev, vlandev->dev_addr);
+   if (real_addr_in_use)
+   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
 
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
@@ -389,13 +392,8 @@ static int vlan_device_event(struct noti
 
case NETDEV_CHANGEADDR:
/* Adjust unicast filters on underlying device */
-   vlan_group_for_each_dev(grp, i, vlandev) {
-   flgs = vlandev->flags;
-   if (!(flgs & IFF_UP))
-   continue;
-
+   vlan_group_for_each_dev(grp, i, vlandev)
vlan_sync_address(dev, vlandev);
-   }
break;
 
case NETDEV_CHANGEMTU:
-- 
1.7.10.4

[PATCH net] vlan: Propagate MAC address changes properly

2016-04-30 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created. After a MAC address change, this results in
an inconsistency: only newly created VLANs have an up-to-date MAC.

Continuing to inherit the MAC address unless explicitly changed for
the VLAN allows IPv6 EUI64 addresses for the VLAN to reflect the change
and thus for DAD to behave as expected for the given MAC.

Signed-off-by: Mike Manning <mmann...@brocade.com>
---
 net/8021q/vlan.c |   17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index d2cd9de..2f57cf2 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -293,15 +293,15 @@ static void vlan_sync_address(struct net_device *dev,
 
/* vlan address was different from the old address and is equal to
 * the new address */
-   if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
+   if ((vlandev->flags & IFF_UP) &&
+   !ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_del(dev, vlandev->dev_addr);
 
-   /* vlan address was equal to the old address and is different from
+   /* vlan address was equal to the old address so now also inherit
 * the new address */
-   if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
-   !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
-   dev_uc_add(dev, vlandev->dev_addr);
+   if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr))
+   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
 
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
@@ -389,13 +389,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 
case NETDEV_CHANGEADDR:
/* Adjust unicast filters on underlying device */
-   vlan_group_for_each_dev(grp, i, vlandev) {
-   flgs = vlandev->flags;
-   if (!(flgs & IFF_UP))
-   continue;
-
+   vlan_group_for_each_dev(grp, i, vlandev)
vlan_sync_address(dev, vlandev);
-   }
break;
 
case NETDEV_CHANGEMTU:
-- 
1.7.10.4

Re: [PATCH net] Propagate MAC address changes to VLANs

2016-04-30 Thread Mike Manning

On 03/03/2016 09:12 PM, David Miller wrote:
> From: Mike Manning <mmann...@brocade.com>
> Date: Mon, 29 Feb 2016 11:32:51 +
> 
>>  
>> -/* vlan address was equal to the old address and is different from
>> +/* vlan address was equal to the old address so now also inherit
>>   * the new address */
>> -if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
>> -!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
>> -dev_uc_add(dev, vlandev->dev_addr);
>> +if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr))
>> +ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
>>  
> 
> This dev_uc_add() call removal cannot be correct, if the device is up
> we must program it into the hardware unicast filters and also
> potentially put it into promiscuous mode via __dev_set_rx_mode().
> 

The call to dev_uc_add() to add a secondary address is only needed if the VLAN 
MAC is different from that for the physical interface. For the proposed 
changes, the VLAN MAC is tracking that of the physical interface and so is the 
same (as typically it does not make sense for these to be different), so 
dev_uc_add() should not be called. The easiest way to demonstrate equivalence 
with the original code, where the MAC address has to be set manually, is with 
some test debugs. Here, first the MAC of the interface itself is changed (so 
dev_uc_add() is called), then the MAC of the VLAN is changed (so dev_uc_del() 
is called):

1) ORIGINAL CODE:

ip addr show dev dp0s8 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
sudo ip link set dp0s8.40 addr 10:20:30:40:50:61
sudo ip link set dp0s8 addr 10:20:30:40:50:61
ip addr show dev dp0s8 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff

[ 3990.332577] --- vlan_dev_set_mac_address: id 40, call dev_uc_add for 10:20:30:40:50:61 on dp0s8
[ 3990.332579] device dp0s8 entered promiscuous mode
[ 4002.425234] 8021q: --- vlan_sync_address: id 40, for 10:20:30:40:50:61 on dp0s8.40 (from dp0s8)
[ 4002.425472] --- vlan_sync_address: id 40, call dev_uc_del for 10:20:30:40:50:61 on dp0s8
[ 4002.425475] --- __hw_addr_del_entry: refcount 0 for 10:20:30:40:50:61
[ 4002.425477] device dp0s8 left promiscuous mode

sudo ip link set dp0s8 addr 52:54:00:1f:06:2a
ip addr show dev dp0s8.40 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff
sudo ip link set dp0s8.40 addr 52:54:00:1f:06:2a
ip addr show dev dp0s8 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff

[ 4121.606671] --- vlan_sync_address: id, 40, call dev_uc_add for 10:20:30:40:50:61 on dp0s8
[ 4121.606673] device dp0s8 entered promiscuous mode
[ 4147.487780] --- vlan_dev_set_mac_address: id 40, for 52:54:00:1f:06:2a on dp0s8
[ 4147.487782] --- vlan_dev_set_mac_address: id 40, call dev_uc_del for 10:20:30:40:50:61 dp0s8
[ 4147.487784] --- __hw_addr_del_entry: refcount 0 for 10:20:30:40:50:61
[ 4147.487786] device dp0s8 left promiscuous mode


2) WITH IMPROVEMENT FOR VLAN MAC TO FOLLOW THAT OF PHYSICAL INTF, UNLESS 
EXPLICITLY SET:

ip addr show dev dp0s8 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
sudo ip link set dp0s8 addr 10:20:30:40:50:61
ip addr show dev dp0s8 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff

[  196.574789] 8021q: --- vlan_sync_address: id 40, for 10:20:30:40:50:61 on dp0s8.40 (from dp0s8)
[  196.575004] --- vlan_sync_address: id 40, update to 10:20:30:40:50:61 on dp0s8.40 (from dp0s8)

sudo ip link set dp0s8 addr 52:54:00:1f:06:2a
ip addr show dev dp0s8 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 52:54:00:1f:06:2a brd ff:ff:ff:ff:ff:ff

[  265.683313] 8021q: --- vlan_sync_address: id 40, for 52:54:00:1f:06:2a on dp0s8.40 (from dp0s8)
[  265.683534] --- vlan_sync_address: id 40, update to 52:54:00:1f:06:2a on dp0s8.40 (from dp0s8)

sudo ip link set dp0s8.40 addr 10:20:30:40:50:61
sudo ip link set dp0s8 addr 10:20:30:40:50:99
ip addr show dev dp0s8 | grep ether
link/ether 10:20:30:40:50:99 brd ff:ff:ff:ff:ff:ff
ip addr show dev dp0s8.40 | grep ether
link/ether 10:20:30:40:50:61 brd ff:ff:ff:ff:ff:ff
sudo ip link set dp0s8 addr 10:20:30:40:50:61

[ 5561.791222] --- vlan_dev_set_mac_address: id 40, for 10:20:30:40:50:61 on 
dp0s8
[ 5561.791225] --- vlan_dev_

1 2 >

1 - 100 of 109 matches

Mail list logo