Re: [next] bonding: pass link-local packets to bonding master also.

2018-11-30 Thread Vincent Bernat
 ❦ 15 juillet 2018 19:12 -0700, Mahesh Bandewar :

> Commit b89f04c61efe ("bonding: deliver link-local packets with
> skb->dev set to link that packets arrived on") changed the behavior
> of how link-local-multicast packets are processed. The change in
> the behavior broke some legacy use cases where these packets are
> expected to arrive on bonding master device also.

Unfortunately, this doesn't completely restore the previous
functionality as PACKET_ORIGDEV is broken for the copy: the original
interface is lost through the call to netif_rx(). A LLDP daemon
listening to the master interface won't get the original interface like
it was able to before 4.12.

I am a bit lost of what the original patch was trying to achieve. I am
using the following test program:

#v+
#!/usr/bin/env python3

import sys
import socket
import datetime

socket.SOL_PACKET = 263
socket.ETH_P_ALL = 3
socket.PACKET_ORIGDEV = 9

interface = sys.argv[1] if len(sys.argv) > 1 else 'lag1'

s = socket.socket(socket.AF_PACKET,
  socket.SOCK_RAW,
  socket.htons(socket.ETH_P_ALL))
s.bind((interface, 0))
s.setsockopt(socket.SOL_PACKET, socket.PACKET_ORIGDEV, 1)
while True:
data, addrinfo = s.recvfrom(1500)
if addrinfo[2] == socket.PACKET_OUTGOING:
continue
print(f"{datetime.datetime.now().isoformat()}: "
  f"Received {len(data)} bytes from {addrinfo}")
#v-

If I run it with a kernel compiled with the commit before b89f04c61efe
(plus a few more cherry-pick to make it work like ea8ffc0818d8 and
72ccc471e13b), I get:

#v+
2018-11-30T22:20:40.193378: Received 221 bytes from ('eth1', 35020, 2, 1, 
b'RT3\x00\x00\x02')
2018-11-30T22:20:40.194504: Received 221 bytes from ('eth0', 35020, 2, 1, 
b'RT3\x00\x00\x01')
#v-

If I send non link-local packets, I get:

#v+
2018-11-30T22:25:57.300965: Received 98 bytes from ('eth0', 2048, 0, 1, 
b'PT3\x00\x00\x02')
#v-

I am also able to correctly receive link-local packets directly on each
interface. So, it seems everything was working as expected before
b89f04c61efe.
-- 
AWAKE! FEAR! FIRE! FOES! AWAKE!
FEAR! FIRE! FOES!
AWAKE! AWAKE!
-- J. R. R. Tolkien


[PATCH net-next v1] net: don't declare IPv6 non-local bind helper if CONFIG_IPV6 undefined

2018-08-01 Thread Vincent Bernat
Fixes: 83ba4645152d ("net: add helpers checking if socket can be bound to 
nonlocal address")
Signed-off-by: Vincent Bernat 
---
 include/net/ipv6.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 82deb684ba73..ff33f498c137 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -766,13 +766,6 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo 
*np, struct flowi6 *fl6,
return hlimit;
 }
 
-static inline bool ipv6_can_nonlocal_bind(struct net *net,
- struct inet_sock *inet)
-{
-   return net->ipv6.sysctl.ip_nonlocal_bind ||
-   inet->freebind || inet->transparent;
-}
-
 /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
  * Equivalent to : flow->v6addrs.src = iph->saddr;
  * flow->v6addrs.dst = iph->daddr;
@@ -789,6 +782,13 @@ static inline void iph_to_flow_copy_v6addrs(struct 
flow_keys *flow,
 
 #if IS_ENABLED(CONFIG_IPV6)
 
+static inline bool ipv6_can_nonlocal_bind(struct net *net,
+ struct inet_sock *inet)
+{
+   return net->ipv6.sysctl.ip_nonlocal_bind ||
+   inet->freebind || inet->transparent;
+}
+
 /* Sysctl settings for net ipv6.auto_flowlabels */
 #define IP6_AUTO_FLOW_LABEL_OFF0
 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1
-- 
2.18.0



[PATCH net-next v1] net: add helpers checking if socket can be bound to nonlocal address

2018-07-31 Thread Vincent Bernat
The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind
|| inet->transparent" is present three times and its IPv6 counterpart
is also present three times. We introduce two small helpers to
characterize these tests uniformly.

Signed-off-by: Vincent Bernat 
---
 include/net/inet_sock.h | 8 
 include/net/ipv6.h  | 7 +++
 net/ipv4/af_inet.c  | 3 +--
 net/ipv4/ping.c | 6 ++
 net/ipv6/af_inet6.c | 6 ++
 net/ipv6/datagram.c | 3 +--
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 314be484c696..e03b93360f33 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk)
return !!inet_sk(sk)->convert_csum;
 }
 
+
+static inline bool inet_can_nonlocal_bind(struct net *net,
+ struct inet_sock *inet)
+{
+   return net->ipv4.sysctl_ip_nonlocal_bind ||
+   inet->freebind || inet->transparent;
+}
+
 #endif /* _INET_SOCK_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a44509f4e985..82deb684ba73 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo 
*np, struct flowi6 *fl6,
return hlimit;
 }
 
+static inline bool ipv6_can_nonlocal_bind(struct net *net,
+ struct inet_sock *inet)
+{
+   return net->ipv6.sysctl.ip_nonlocal_bind ||
+   inet->freebind || inet->transparent;
+}
+
 /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store
  * Equivalent to : flow->v6addrs.src = iph->saddr;
  * flow->v6addrs.dst = iph->daddr;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2a0a3bab6b5..ee707b91d1a7 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -486,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, 
int addr_len,
 *  is temporarily down)
 */
err = -EADDRNOTAVAIL;
-   if (!net->ipv4.sysctl_ip_nonlocal_bind &&
-   !(inet->freebind || inet->transparent) &&
+   if (!inet_can_nonlocal_bind(net, inet) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b54c964ad925..8d7aaf118a30 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct 
inet_sock *isk,
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
 
-   if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
-   isk->freebind == 0 && isk->transparent == 0 &&
+   if ((!inet_can_nonlocal_bind(net, isk) &&
 chk_addr_ret != RTN_LOCAL) ||
chk_addr_ret == RTN_MULTICAST ||
chk_addr_ret == RTN_BROADCAST)
@@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct 
inet_sock *isk,
scoped);
rcu_read_unlock();
 
-   if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
- isk->freebind || isk->transparent || has_addr ||
+   if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
  addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c9535354149f..020f6e14a7af 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr 
*uaddr, int addr_len,
/* Reproduce AF_INET checks to make the bindings consistent */
v4addr = addr->sin6_addr.s6_addr32[3];
chk_addr_ret = inet_addr_type(net, v4addr);
-   if (!net->ipv4.sysctl_ip_nonlocal_bind &&
-   !(inet->freebind || inet->transparent) &&
+   if (!inet_can_nonlocal_bind(net, inet) &&
v4addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr 
*uaddr, int addr_len,
 */
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
-   if (!net->ipv6.sysctl.ip_nonlocal_bind &&
-   !(inet->freebind || inet->transparent) 

Re: [net-next v1] net/ipv6: allow any source address for sendmsg pktinfo with ip_nonlocal_bind

2018-07-30 Thread Vincent Bernat
 ❦ 29 juillet 2018 12:28 -0700, David Miller  :

>> When freebind feature is set of an IPv6 socket, any source address can
>> be used when sending UDP datagrams using IPv6 PKTINFO ancillary
>> message. Global non-local bind feature was added in commit
>> 35a256fee52c ("ipv6: Nonlocal bind") for IPv6. This commit also allows
>> IPv6 source address spoofing when non-local bind feature is enabled.
>> 
>> Signed-off-by: Vincent Bernat 
>
> This definitely seems to make sense.  And is consistent with the other
> tests involving freebind and transparent.
>
> This test involving ip_nonlocal_bind, freeebind, and transparent happens
> in several locations.  Perhaps we should add a helper function for
> this?

Yes, I can do that. Should I also include one for SCTP?
-- 
"Elves and Dragons!" I says to him.  "Cabbages and potatoes are better
for you and me."
-- J. R. R. Tolkien


[net-next v1] net/ipv6: allow any source address for sendmsg pktinfo with ip_nonlocal_bind

2018-07-25 Thread Vincent Bernat
When freebind feature is set of an IPv6 socket, any source address can
be used when sending UDP datagrams using IPv6 PKTINFO ancillary
message. Global non-local bind feature was added in commit
35a256fee52c ("ipv6: Nonlocal bind") for IPv6. This commit also allows
IPv6 source address spoofing when non-local bind feature is enabled.

Signed-off-by: Vincent Bernat 
---
 net/ipv6/datagram.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 201306b9b5ea..c46936563b15 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -800,7 +800,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
if (addr_type != IPV6_ADDR_ANY) {
int strict = __ipv6_addr_src_scope(addr_type) 
<= IPV6_ADDR_SCOPE_LINKLOCAL;
-   if (!(inet_sk(sk)->freebind || 
inet_sk(sk)->transparent) &&
+   if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
+ inet_sk(sk)->freebind || 
inet_sk(sk)->transparent) &&
!ipv6_chk_addr_and_flags(net, 
_info->ipi6_addr,
 dev, !strict, 0,
 IFA_F_TENTATIVE) &&
-- 
2.18.0



Re: [PATCH iproute2-next] ipaddress: fix label matching

2018-07-14 Thread Vincent Bernat
 ❦ 14 juillet 2018 21:54 +0300, Serhey Popovych  :

> We should leave only filter.label check and return 0:
>
> if (filter.label)
> return 0;
>
> This will ensure we exit from print_linkinfo() earlier, skip
> print_link_stats() and push final filtering by label to
> print_selected_addrinfo() and print_addrinfo().
>
> And finally: this is regression and should be against iproute2, not -next.

As you did the analysis, I let you do the patch.
-- 
Watch out for off-by-one errors.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH iproute2-next] ipaddress: fix label matching

2018-07-11 Thread Vincent Bernat
 ❦ 11 juillet 2018 21:01 -0400, David Ahern  :

>> +++ b/ip/ipaddress.c
>> @@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
>>  if (!name)
>>  return -1;
>>  
>> -if (filter.label &&
>> -(!filter.family || filter.family == AF_PACKET) &&
>> -fnmatch(filter.label, name, 0))
>> -return -1;
>> -
>
> The offending commit changed the return code:
>
> if (filter.label &&
> (!filter.family || filter.family == AF_PACKET) &&
> -   fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0))
> -   return 0;
> +   fnmatch(filter.label, name, 0))
> +   return -1;
>
>
> Vincent: can you try leaving the code as is, but change the return to 0?

Yes, it works by just returning 0. The code still doesn't make sense.
-- 
Many pages make a thick book, except for pocket Bibles which are on very
very thin paper.


Re: [PATCH iproute2-next] ipaddress: fix label matching

2018-07-11 Thread Vincent Bernat
 ❦ 11 juillet 2018 13:03 -0700, Stephen Hemminger  :

>> Since 9516823051ce, "ip addr show label lo:1" doesn't work
>> anymore (doesn't show any address, despite a matching label).
>> Reverting to return 0 instead of -1 fix the issue.
>> 
>> However, the condition says: "if we filter by label [...] and the
>> label does NOT match the interface name". This makes little sense to
>> compare the label with the interface name. There is also a logic
>> around filter family being provided or not. The match against the
>> label is done by ifa_label_match_rta() in print_addrinfo() and
>> ipaddr_filter().
>> 
>> Just removing the condition makes "ip addr show" works as expected
>> with or without specifying a label, both when the label is matching
>> and not matching. It also works if we specify a label and the label is
>> the interface name. The flush operation also works as expected.
>> 
>> Fixes: 9516823051ce ("ipaddress: Improve print_linkinfo()")
>> Signed-off-by: Vincent Bernat 
>> ---
>>  ip/ipaddress.c | 5 -
>>  1 file changed, 5 deletions(-)
>> 
>> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
>> index 5009bfe6d2e3..20ef6724944e 100644
>> --- a/ip/ipaddress.c
>> +++ b/ip/ipaddress.c
>> @@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
>>  if (!name)
>>  return -1;
>>  
>> -if (filter.label &&
>> -(!filter.family || filter.family == AF_PACKET) &&
>> -fnmatch(filter.label, name, 0))
>> -return -1;
>> -
>>  if (tb[IFLA_GROUP]) {
>>  int group = rta_getattr_u32(tb[IFLA_GROUP]);
>> 
>
> If this is a regression, it should go to iproute2 not iproute2-next.
>
> Surprised by the solution since it is removing code that was there
> before the commit you referenced in Fixes.

Yes, but as I explain in the commit message, the condition does not make
sense for me: why would we match the label against the interface name?
This code exists since a long time.
-- 
The lunatic, the lover, and the poet,
Are of imagination all compact...
-- Wm. Shakespeare, "A Midsummer Night's Dream"


[PATCH iproute2-next] ipaddress: fix label matching

2018-07-11 Thread Vincent Bernat
Since 9516823051ce, "ip addr show label lo:1" doesn't work
anymore (doesn't show any address, despite a matching label).
Reverting to return 0 instead of -1 fix the issue.

However, the condition says: "if we filter by label [...] and the
label does NOT match the interface name". This makes little sense to
compare the label with the interface name. There is also a logic
around filter family being provided or not. The match against the
label is done by ifa_label_match_rta() in print_addrinfo() and
ipaddr_filter().

Just removing the condition makes "ip addr show" works as expected
with or without specifying a label, both when the label is matching
and not matching. It also works if we specify a label and the label is
the interface name. The flush operation also works as expected.

Fixes: 9516823051ce ("ipaddress: Improve print_linkinfo()")
Signed-off-by: Vincent Bernat 
---
 ip/ipaddress.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 5009bfe6d2e3..20ef6724944e 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who,
if (!name)
return -1;
 
-   if (filter.label &&
-   (!filter.family || filter.family == AF_PACKET) &&
-   fnmatch(filter.label, name, 0))
-   return -1;
-
if (tb[IFLA_GROUP]) {
int group = rta_getattr_u32(tb[IFLA_GROUP]);
 
-- 
2.18.0



[PATCH net-next v1] netfilter: provide input interface for route lookup for rpfilter

2018-05-20 Thread Vincent Bernat
In commit 47b7e7f82802, this bit was removed at the same time the
RT6_LOOKUP_F_IFACE flag was removed. However, it is needed when
link-local addresses are used, which is a very common case: when
packets are routed, neighbor solicitations are done using link-local
addresses. For example, the following neighbor solicitation is not
matched by "-m rpfilter":

IP6 fe80::5254:33ff:fe00:1 > ff02::1:ff00:3: ICMP6, neighbor
solicitation, who has 2001:db8::5254:33ff:fe00:3, length 32

Commit 47b7e7f82802 doesn't quite explain why we shouldn't use
RT6_LOOKUP_F_IFACE in the rpfilter case. I suppose the interface check
later in the function would make it redundant. However, the remaining
of the routing code is using RT6_LOOKUP_F_IFACE when there is no
source address (which matches rpfilter's case with a non-unicast
destination, like with neighbor solicitation).

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
Fixes: 47b7e7f82802 ("netfilter: don't set F_IFACE on ipv6 fib lookups")
---
 net/ipv6/netfilter/ip6t_rpfilter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c 
b/net/ipv6/netfilter/ip6t_rpfilter.c
index d12f511929f5..0fe61ede77c6 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const 
struct sk_buff *skb,
}
 
fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
+   if ((flags & XT_RPFILTER_LOOSE) == 0)
+   fl6.flowi6_oif = dev->ifindex;
 
rt = (void *)ip6_route_lookup(net, , skb, lookup_flags);
if (rt->dst.error)
-- 
2.17.0



Re: [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup

2018-05-01 Thread Vincent Bernat
 ❦ 29 avril 2018 11:07 -0700, David Ahern  :

> +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
> +   int flags)

Maybe an EXPORT_SYMBOL_GPL? There is one for __fib_lookup (fib_lookup is
an inline function).
-- 
Use the "telephone test" for readability.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH net-next v1] ipvs: add consistent source hashing scheduling

2018-04-02 Thread Vincent Bernat
 ❦  2 avril 2018 22:05 +0300, Julian Anastasov  :

>   Sorry to say it but may be you missed the discussion
> on lvs-devel about the new MH scheduler implemented by Inju Song:
>
> https://www.spinics.net/lists/lvs-devel/msg04928.html
> http://archive.linuxvirtualserver.org/html/lvs-devel/2018-03/msg00023.html
>
>   In the last 6 months we fixed all issues and I acked
> v4 just yesterday:
>
> http://archive.linuxvirtualserver.org/html/lvs-devel/2018-04/msg3.html

For some reason, "ipvs maglev" in Google doesn't yield those
results. But having code already reviewed is great news for me: I can
use it with confidence. :)
-- 
Make sure your code "does nothing" gracefully.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH net-next v1] ipvs: add consistent source hashing scheduling

2018-04-02 Thread Vincent Bernat
 ❦  2 avril 2018 10:33 -0700, Eric Dumazet  :

>> +static inline u32
>> +ip_vs_csh_permutation(struct ip_vs_dest *d, int j)
>> +{
>> +u32 offset, skip;
>> +__be32 addr_fold = d->addr.ip;
>> +
>> +#ifdef CONFIG_IP_VS_IPV6
>> +if (d->af == AF_INET6)
>> +addr_fold = d->addr.ip6[0]^d->addr.ip6[1]^
>> +d->addr.ip6[2]^d->addr.ip6[3];
>> +#endif
>> +addr_fold = ntohl(addr_fold) + ntohs(d->port);
>> +offset = hash_32(addr_fold, 32) % IP_VS_CSH_TAB_SIZE;
>> +skip = (hash_32(addr_fold + 1, 32) % (IP_VS_CSH_TAB_SIZE - 1)) + 1;
>> +return (offset + j * skip) % IP_VS_CSH_TAB_SIZE;
>> +}
>> +
>
> This does not look very strong to me, particularly the IPv6 folding
>
> I would rather use __ipv6_addr_jhash() instead of ipv6_addr_hash(),
> even if it is hard coded ;)

I can switch to ipv6_addr_hash(). However, switching to
__ipv6_addr_jhash seems useless as I would need to hardcode the initial
value: people use source hashing to get the same result from one host to
another. Am I missing something?
-- 
Each module should do one thing well.
- The Elements of Programming Style (Kernighan & Plauger)


[PATCH net-next v1] ipvs: add consistent source hashing scheduling

2018-04-02 Thread Vincent Bernat
Based on Google's Maglev algorithm [1][2], this scheduler builds a
lookup table in a way disruption is minimized when a change
occurs. This helps in case of active/active setup without
synchronization. Like for classic source hashing, this lookup table is
used to assign connections to a real server.

Both source address and port are used to compute the hash (unlike sh
where this is optional).

Weights are correctly handled. Unlike sh, servers with a weight of 0
are considered as absent. Also, unlike sh, when a server becomes
unavailable due to a threshold, no fallback is possible: doing so
would seriously impair the the usefulness of using a consistent hash.

There is a small hack to detect when all real servers have a weight of
0. It relies on the fact it is not possible for the weight of a real
server to change during the execution of the assignment. I believe
this is the case as modifications through netlink are subject to a
mutex, but the use of atomic_read() is unsettling.

The value of 65537 for the hash table size is currently not modifiable
at compile-time. This is the value suggested in the Maglev
paper. Another possible value is 257 (for small tests) and 655373 (for
very large setups).

[1]: https://research.google.com/pubs/pub44824.html
[2]: 
https://blog.acolyer.org/2016/03/21/maglev-a-fast-and-reliable-software-network-load-balancer/

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 include/net/ip_vs.h|  27 
 net/netfilter/ipvs/Kconfig |  13 ++
 net/netfilter/ipvs/Makefile|   1 +
 net/netfilter/ipvs/ip_vs_csh.c | 339 +
 net/netfilter/ipvs/ip_vs_sh.c  |  32 +---
 5 files changed, 381 insertions(+), 31 deletions(-)
 create mode 100644 net/netfilter/ipvs/ip_vs_csh.c

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index eb0bec043c96..2184b43b7320 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -195,6 +195,33 @@ static inline int ip_vs_addr_equal(int af, const union 
nf_inet_addr *a,
return a->ip == b->ip;
 }
 
+static inline __be16 ip_vs_get_port(const struct sk_buff *skb, struct 
ip_vs_iphdr *iph)
+{
+   __be16 _ports[2], *ports;
+
+   /* At this point we know that we have a valid packet of some kind.
+* Because ICMP packets are only guaranteed to have the first 8
+* bytes, let's just grab the ports.  Fortunately they're in the
+* same position for all three of the protocols we care about.
+*/
+   switch (iph->protocol) {
+   case IPPROTO_TCP:
+   case IPPROTO_UDP:
+   case IPPROTO_SCTP:
+   ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+  &_ports);
+   if (unlikely(!ports))
+   return 0;
+
+   if (likely(!ip_vs_iph_inverse(iph)))
+   return ports[0];
+   else
+   return ports[1];
+   default:
+   return 0;
+   }
+}
+
 #ifdef CONFIG_IP_VS_DEBUG
 #include 
 
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..bfd091e020af 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,19 @@ config IP_VS_SH
  If you want to compile it in kernel, say Y. To compile it as a
  module, choose M here. If unsure, say N.
 
+config IP_VS_CSH
+   tristate "consistent source hashing scheduling"
+   ---help---
+ The consistent source hashing scheduling algorithm assigns
+ network connections to the servers through looking up a
+ statically assigned hash table by their source IP addresses
+ and ports. The hash table is built to minimize disruption
+ when a server becomes unavailable. It relies on the Maglev
+ algorithm.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
 config IP_VS_SED
tristate "shortest expected delay scheduling"
---help---
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..7d0badf86dfe 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_CSH) += ip_vs_csh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
diff --git a/net/netfilter/ipvs/ip_vs_csh.c b/net/netfilter/ipvs/ip_vs_csh.c
new file mode 100644
index ..c70db61ba934
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_csh.c
@@ -0,0 +1,339 @@
+/*
+ * IPVS:Consistent Hashing scheduling module using Google's Maglev
+ *
+ * Authors: Vincent Bernat <vinc...@bernat.im>
+ *
+ *  This program is free software; you can redistr

[PATCH net-next v2] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms

2018-04-01 Thread Vincent Bernat
The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative
hashing incorrectly. Replace its use by the hash_32() macro, which
correctly implements this algorithm. It doesn't use the same constant,
but it shouldn't matter.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/netfilter/ipvs/ip_vs_dh.c| 3 ++-
 net/netfilter/ipvs/ip_vs_lblc.c  | 3 ++-
 net/netfilter/ipvs/ip_vs_lblcr.c | 3 ++-
 net/netfilter/ipvs/ip_vs_sh.c| 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..dfea31fb10c5 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const 
union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+   return hash_32(ntohl(addr_fold),  IP_VS_DH_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..08147fc6400c 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* for sysctl */
 #include 
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+   return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..9b6a6c9e9cfa 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* for sysctl */
 #include 
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+   return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+   return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+IP_VS_SH_TAB_BITS)) &
IP_VS_SH_TAB_MASK;
 }
 
-- 
2.16.3



Re: [PATCH net-next v1] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms

2018-04-01 Thread Vincent Bernat
 ❦  1 avril 2018 11:11 +0300, Julian Anastasov  :

>> -return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
>> +return ((ntohl(addr_fold)*2654435761U) >>
>> +(32 - IP_VS_DH_TAB_BITS)) &
>> +IP_VS_DH_TAB_MASK;
>
>   Looks like the '& mask' part is not needed, still,
> it does not generate extra code. I see that other code uses
> hash_32(val, bits) from include/linux/hash.h but note that it
> used different ratio before Linux 4.7, in case someone backports
> this patch on old kernels. So, I don't have preference what should
> be used, may be return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS)
> is better.

I didn't notice this macro. I think this is a better option. Let me
amend the patch.
-- 
Don't stop with your first draft.
- The Elements of Programming Style (Kernighan & Plauger)


[PATCH net-next v1] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms

2018-03-31 Thread Vincent Bernat
The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative
hashing incorrectly. This results in uneven distribution.

To fix this, the result has to be shifted by a constant. In "Lecture
21: Hash functions" [1], it is said:

   In the fixed-point version, The division by 2^q is crucial. The
   common mistake when doing multiplicative hashing is to forget to do
   it, and in fact you can find web pages highly ranked by Google that
   explain multiplicative hashing without this step. Without this
   division, there is little point to multiplying by a, because ka mod
   m = (k mod m) * (a mod m) mod m . This is no better than modular
   hashing with a modulus of m, and quite possibly worse.

Typing the 2654435761 constant in DuckDuckGo shows many other sources
to confirm this issue. Moreover, doing the multiplication in the 32bit
integer space is enough, hence the change from 2654435761UL to
2654435761U.

[1]: https://www.cs.cornell.edu/courses/cs3110/2008fa/lectures/lec21.html

The following Python program illustrates the bug and its fix:

import netaddr
import collections
import socket
import statistics

def run(buggy=False):
base = netaddr.IPAddress('203.0.113.0')
count = collections.defaultdict(int)
for offset in range(100):
for port in range(1, 11000):
r = socket.ntohs(port) + socket.ntohl(int(base) + offset)
r *= 2654435761
if buggy:
r %= 1 << 64
else:
r %= 1 << 32
r >>= 24
r &= 255
count[r] += 1

print(buggy,
  statistics.mean(count.values()),
  statistics.stdev(count.values()))

run(True)
run(False)

Its output is:

True 25000 765.9416862050705
False 390.625 4.681209831891333

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/netfilter/ipvs/ip_vs_dh.c| 4 +++-
 net/netfilter/ipvs/ip_vs_lblc.c  | 4 +++-
 net/netfilter/ipvs/ip_vs_lblcr.c | 4 +++-
 net/netfilter/ipvs/ip_vs_sh.c| 3 ++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..5638e66dbdd1 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -81,7 +81,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const 
union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+   return ((ntohl(addr_fold)*2654435761U) >>
+   (32 - IP_VS_DH_TAB_BITS)) &
+   IP_VS_DH_TAB_MASK;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..df32022a2bc4 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -160,7 +160,9 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+   return ((ntohl(addr_fold)*2654435761U) >>
+   (32 - IP_VS_LBLC_TAB_BITS)) &
+   IP_VS_LBLC_TAB_MASK;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..3d0d278d4901 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -323,7 +323,9 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+   return ((ntohl(addr_fold)*2654435761U) >>
+   (32 - IP_VS_LBLCR_TAB_BITS)) &
+   IP_VS_LBLCR_TAB_MASK;
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..d2d6cdfae86e 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
 #endif
-   return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+   return ((offset + (ntohs(port) + ntohl(addr_fold))*2654435761U) >>
+   (32 - IP_VS_SH_TAB_BITS)) &
IP_VS_SH_TAB_MASK;
 }
 
-- 
2.16.3



Re: [PATCH iproute2-next] color: disable color when json output is requested

2018-02-20 Thread Vincent Bernat
 ❦ 20 février 2018 16:04 -0800, Stephen Hemminger <step...@networkplumber.org> :

>> Instead of declaring -color and -json exclusive, ignore -color when
>> -json is provided. The rationale is to allow to put -color in an alias
>> for ip while still being able to use -json. -color is merely a
>> presentation suggestion and we can assume there is nothing to color in
>> the JSON output.
>> 
>> Signed-off-by: Vincent Bernat <vinc...@bernat.im>
>
> Looks fine to me, this could even go into master.
> Need to update man page and make sure behavior is consistent
> across ip, tc, and bridge commands.

Currently, in master or net-next, only the "ip" command has a -color
option.
-- 
The human race is a race of cowards; and I am not only marching in that
procession but carrying a banner.
-- Mark Twain


[PATCH iproute2-next] color: disable color when json output is requested

2018-02-20 Thread Vincent Bernat
Instead of declaring -color and -json exclusive, ignore -color when
-json is provided. The rationale is to allow to put -color in an alias
for ip while still being able to use -json. -color is merely a
presentation suggestion and we can assume there is nothing to color in
the JSON output.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 include/color.h | 1 -
 ip/ip.c | 7 ---
 lib/color.c | 8 
 3 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/include/color.h b/include/color.h
index f6c351b77746..c80359d3e2e9 100644
--- a/include/color.h
+++ b/include/color.h
@@ -13,7 +13,6 @@ enum color_attr {
 };
 
 void enable_color(void);
-void check_if_color_enabled(void);
 void set_color_palette(void);
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...);
 enum color_attr ifa_family_color(__u8 ifa_family);
diff --git a/ip/ip.c b/ip/ip.c
index b15e6b66b3f6..ebf77aa2ee37 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -172,6 +172,7 @@ int main(int argc, char **argv)
 {
char *basename;
char *batch_file = NULL;
+   int color = 0;
 
basename = strrchr(argv[0], '/');
if (basename == NULL)
@@ -273,7 +274,7 @@ int main(int argc, char **argv)
}
rcvbuf = size;
} else if (matches(opt, "-color") == 0) {
-   enable_color();
+   ++color;
} else if (matches(opt, "-help") == 0) {
usage();
} else if (matches(opt, "-netns") == 0) {
@@ -293,8 +294,8 @@ int main(int argc, char **argv)
 
_SL_ = oneline ? "\\" : "\n";
 
-   if (json)
-   check_if_color_enabled();
+   if (color && !json)
+   enable_color();
 
if (batch_file)
return batch(batch_file);
diff --git a/lib/color.c b/lib/color.c
index a13a4930b10c..da1f516cb249 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -92,14 +92,6 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
-void check_if_color_enabled(void)
-{
-   if (color_is_enabled) {
-   fprintf(stderr, "Option \"-json\" conflicts with 
\"-color\".\n");
-   exit(1);
-   }
-}
-
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
-- 
2.16.1



Re: [PATCH] net: bridge: add max_fdb_count

2017-11-16 Thread Vincent Bernat
 ❦ 16 novembre 2017 20:23 +0100, Andrew Lunn  :

> struct net_bridge_fdb_entry is 40 bytes.
>
> My WiFi access point which is also a 5 port bridge, currently has 97MB
> free RAM. That is space for about 2.5M FDB entries. So even Roopa's
> 128K is not really a problem, in terms of memory.

I am also interested in Sarah's patch because we can now have bridge
with many ports through VXLAN. The FDB can be replicated to an external
daemon with BGP and the cost of each additional MAC address is therefore
higher than just a few bytes. It seems simpler to implement a limiting
policy early (at the port or bridge level).

Also, this is a pretty standard limit to have for a bridge (switchport
port-security maximum on Cisco, set interface X mac-limit on
Juniper). And it's not something easy to do with ebtables.
-- 
Use the good features of a language; avoid the bad ones.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH net] vxlan: fix the issue that neigh proxy blocks all icmpv6 packets

2017-11-11 Thread Vincent Bernat
 ❦ 11 novembre 2017 19:58 +0800, Xin Long <lucien@gmail.com> :

> Commit f1fb08f6337c ("vxlan: fix ND proxy when skb doesn't have transport
> header offset") removed icmp6_code and icmp6_type check before calling
> neigh_reduce when doing neigh proxy.
>
> It means all icmpv6 packets would be blocked by this, not only ns packet.
> In Jianlin's env, even ping6 couldn't work through it.
>
> This patch is to bring the icmp6_code and icmp6_type check back and also
> removed the same check from neigh_reduce().

I am very sorry for not having spotted this bug earlier. I have tested
your fix and I can confirm it works as expected.

> Fixes: f1fb08f6337c ("vxlan: fix ND proxy when skb doesn't have transport 
> header offset")
> Reported-by: Jianlin Shi <ji...@redhat.com>
> Signed-off-by: Xin Long <lucien@gmail.com>
Reviewed-by: Vincent Bernat <vinc...@bernat.im>
-- 
Don't just echo the code with comments - make every comment count.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl

2017-09-21 Thread Vincent Bernat
 ❦ 21 septembre 2017 08:15 -0700, Stephen Hemminger 
<step...@networkplumber.org> :

>> Currently, there is a difference in netlink events received when an
>> interface is modified through bridge ioctl() or through netlink. This
>> patch generates additional events when an interface is added to or
>> removed from a bridge via ioctl().
>> 
>> When adding then removing an interface from a bridge with netlink, we
>> get:
>> 
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 
>> state UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
>> UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 
>> When using ioctl():
>> 
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
>> bridge0 state UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
>> UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 
>> state UNKNOWN
>> link/ether 9e:da:60:ee:cf:c8
>> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
>> UNKNOWN group default
>> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
>> 
>> Without this patch, the last netlink notification is not sent.
>> 
>> Signed-off-by: Vincent Bernat <vinc...@bernat.im>
>
> This makes sense, you should probably add a Fixes: tag to help maintainers
> of long term stable kernels.
>
> Reviewed-by: Stephen Hemminger <step...@networkplumber.org>

I wouldn't know which commit would be fixed since this is not a
regression, just a behavior difference.
-- 
Make sure special cases are truly special.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-21 Thread Vincent Bernat
 ❦ 21 septembre 2017 08:09 -0700, Roopa Prabhu  :

>>> The one concern is that ports added or removed through ioctl should
>>> cause same events as doing the same thing via netlink. Some users use
>>> brctl (ioctl) and others use newer bridge (netlink) API.
>>
>> I'll make a third iteration to have the same notifications when using
>> ioctl() with details in the commit message.
>> --
>
> as long as the ioctl path for bridge port removal is generating a:
> RTM_DELLINK with AF_BRIDGE and
> RTM_NEWLINK with AF_UNSPEC with 'master' removed
>
> we should be good. These are the most important ones.
>
> are there other specific bridge-events missing ?. you might want to
> run `bridge monitor link` also. and un-slaving of a port also triggers
> fdb events which are visible under `bridge monitor fdb`

With the patch, bridge monitor link generates the same event with
ioctl() than with netlink (like for ip monitor link, netlink generates
one extra duplicate event when enslaving).

For bridge monitor fdb, there is a difference. With ioctl(), I don't get
the event for VLAN1:

Deleted ca:18:06:bc:f6:11 dev dummy1 vlan 1 master bridge0 permanent

I suppose this is an expected difference due to the inability to manage
VLAN-aware bridges with ioctl().
-- 
Use the fundamental control flow constructs.
- The Elements of Programming Style (Kernighan & Plauger)


[net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl

2017-09-21 Thread Vincent Bernat
Currently, there is a difference in netlink events received when an
interface is modified through bridge ioctl() or through netlink. This
patch generates additional events when an interface is added to or
removed from a bridge via ioctl().

When adding then removing an interface from a bridge with netlink, we
get:

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

When using ioctl():

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 
state UNKNOWN group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN
link/ether 9e:da:60:ee:cf:c8
Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 9e:da:60:ee:cf:c8
5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff

Without this patch, the last netlink notification is not sent.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/bridge/br_ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..66cd98772051 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, 
int isadd)
else
ret = br_del_if(br, dev);
 
+   if (!ret)
+   rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
return ret;
 }
 
-- 
2.14.1



Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-21 Thread Vincent Bernat
 ❦ 20 septembre 2017 16:21 -0700, Stephen Hemminger 
 :

> The one concern is that ports added or removed through ioctl should
> cause same events as doing the same thing via netlink. Some users use
> brctl (ioctl) and others use newer bridge (netlink) API.

I'll make a third iteration to have the same notifications when using
ioctl() with details in the commit message.
-- 
When in doubt, tell the truth.
-- Mark Twain


Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-20 Thread Vincent Bernat
 ❦ 20 septembre 2017 15:57 -0600, David Ahern  :

> The DELLINK is for AF_BRIDGE family (ifi_family). Adding family to
> print_linkinfo and running the monitor I get:
>
>
> [LINK]family 0: 35: dummy1:  mtu 1500 qdisc
> noqueue master br0 state UNKNOWN group default
> link/ether d6:c3:73:86:3c:73 brd ff:ff:ff:ff:ff:ff
>
> [LINK]family 7: 35: dummy1:  mtu 1500
> master br0 state UNKNOWN
> link/ether d6:c3:73:86:3c:73
>
> [LINK]Deleted family 7: 35: dummy1:  mtu
> 1500 master br0 state UNKNOWN
> link/ether d6:c3:73:86:3c:73

I didn't know about the family. We can drop the patch.
-- 
One of the most striking differences between a cat and a lie is that a cat has
only nine lives.
-- Mark Twain, "Pudd'nhead Wilson's Calendar"


[PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-16 Thread Vincent Bernat
Currently, when an interface is released from a bridge via
ioctl(), we get a RTM_DELLINK event through netlink:

Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 6e:23:c2:54:3a:b3

Userspace has to interpret that as a removal from the bridge, not as a
complete removal of the interface. When an bridged interface is
completely removed, we get two events:

Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
link/ether 6e:23:c2:54:3a:b3
Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group 
default
link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff

In constrast, when an interface is released from a bond, we get a
RTM_NEWLINK with only the new characteristics (no master):

3: dummy1: <BROADCAST,NOARP,SLAVE,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
bond0 state UNKNOWN group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
UP group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default
link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff
4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
UP group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff

Userland may be confused by the fact we say a link is deleted while
its characteristics are only modified. A first solution would have
been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK
event. However, maybe some piece of userland is relying on this
RTM_DELLINK to detect when a bridged interface is released. Instead,
we also emit a RTM_NEWLINK event once the interface is
released (without master info).

Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 8a:bb:e7:94:b1:f8
2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff

This is done only when using ioctl(). When using Netlink, such an
event is already automatically emitted in do_setlink().

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/bridge/br_ioctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7970f8540cbb..3148cb3a8e82 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -99,8 +99,10 @@ static int add_del_if(struct net_bridge *br, int ifindex, 
int isadd)
 
if (isadd)
ret = br_add_if(br, dev);
-   else
+   else {
ret = br_del_if(br, dev);
+   rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+   }
 
return ret;
 }
-- 
2.14.1



Re: [PATCH net-next v1] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-15 Thread Vincent Bernat
 ❦ 15 septembre 2017 21:38 +0200, Vincent Bernat <vinc...@bernat.im> :

> Currently, when an interface is released from a bridge, we get a
> RTM_DELLINK event through netlink:
>
> Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 
> state UNKNOWN
> link/ether 6e:23:c2:54:3a:b3

It should be noted this only happens when using the ioctl API. When
using the netlink API, rtnetlink.c will send a RTM_NEWLINK because of
the modification.
-- 
When in doubt, tell the truth.
-- Mark Twain


[PATCH net-next v1] bridge: also trigger RTM_NEWLINK when interface is released from bridge

2017-09-15 Thread Vincent Bernat
Currently, when an interface is released from a bridge, we get a
RTM_DELLINK event through netlink:

Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 6e:23:c2:54:3a:b3

Userspace has to interpret that as a removal from the bridge, not as a
complete removal of the interface. When an bridged interface is
completely removed, we get two events:

Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN
link/ether 6e:23:c2:54:3a:b3
Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group 
default
link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff

In constrast, when an interface is released from a bond, we get a
RTM_NEWLINK with only the new characteristics (no master):

3: dummy1: <BROADCAST,NOARP,SLAVE,UP,LOWER_UP> mtu 1500 qdisc noqueue master 
bond0 state UNKNOWN group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
UP group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff
3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default
link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff
4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state 
UP group default
link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff

Userland may be confused by the fact we say a link is deleted while
its characteristics are only modified. A first solution would have
been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK
event. However, maybe some piece of userland is relying on this
RTM_DELLINK to detect when a bridged interface is released. Instead,
we also emit a RTM_NEWLINK event once the interface is
released (without master info).

Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state 
UNKNOWN
link/ether 8a:bb:e7:94:b1:f8
2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN 
group default
link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/bridge/br_if.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index f3aef22931ab..636e0a842f8a 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -288,6 +288,8 @@ static void del_nbp(struct net_bridge_port *p)
 
dev->priv_flags &= ~IFF_BRIDGE_PORT;
 
+   rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL);
+
netdev_rx_handler_unregister(dev);
 
br_multicast_del_port(p);
-- 
2.14.1



[PATCH net-next v2] net: ipv6: avoid overhead when no custom FIB rules are installed

2017-08-08 Thread Vincent Bernat
If the user hasn't installed any custom rules, don't go through the
whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4:
Avoid overhead when no custom FIB rules are installed).

Using a micro-benchmark module [1], timing ip6_route_output() with
get_cycles(), with 40,000 routes in the main routing table, before this
patch:

min=606 max=12911 count=627 average=1959 95th=4903 90th=3747 50th=1602 
mad=821
table=254 avgdepth=21.8 maxdepth=39
value │ ┊count
  600 │▒▒▒ 199
  880 │▒▒▒  43
 1160 │▒▒▒  48
 1440 │▒▒▒░░░   43
 1720 │░░░  59
 2000 │▒▒▒  50
 2280 │▒▒░░░26
 2560 │▒▒░  31
 2840 │▒▒   28
 3120 │▒░░  17
 3400 │▒░░░ 17
 3680 │░ 8
 3960 │░░   11
 4240 │░░6
 4520 │░░░   6
 4800 │░░░   9

After:

min=544 max=11687 count=627 average=1776 95th=4546 90th=3585 50th=1227 
mad=565
table=254 avgdepth=21.8 maxdepth=39
value │ ┊count
  540 │201
  800 │▒63
 1060 │▒░   68
 1320 │▒▒▒░░39
 1580 │▒▒░░ 32
 1840 │▒▒   32
 2100 │▒▒░░░34
 2360 │▒▒░░ 33
 2620 │▒▒   26
 2880 │▒░░  22
 3140 │  9
 3400 │░ 8
 3660 │░ 9
 3920 │░░8
 4180 │░░░   8
 4440 │░░░   8

At the frequency of the host during the bench (~ 3.7 GHz), this is
about a 100 ns difference on the median value.

A next step would be to collapse local and main tables, as in
0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse).

[1]: 
https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
Reviewed-by: Jiri Pirko <j...@mellanox.com>
---
 include/net/netns/ipv6.h |  1 +
 net/ipv6/fib6_rules.c| 40 +++-
 net/ipv6/route.c |  1 +
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index abdf3b40303b..0e50bf3ed097 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -65,6 +65,7 @@ struct netns_ipv6 {
unsigned int ip6_rt_gc_expire;
unsigned longip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+   bool fib6_has_custom_rules;
struct rt6_info *ip6_prohibit_entry;
struct rt6_info *ip6_blk_hole_entry;
struct fib6_table   *fib6_local_tbl;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2f29e4e33bd3..b240f24a6e52 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net)
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
   int flags, pol_lookup_t lookup)
 {
-   struct fib_lookup_arg arg = {
-   .lookup_ptr = lookup,
-   .flags = FIB_LOOKUP_NOREF,
-   };
-
-   /* update flow if oif or iif point to device enslaved to l3mdev */
-   l3mdev_update_flow(net, flowi6_to_flowi(fl6));
-
-   fib_rules_lookup(net->ipv6.fib6_rules_ops,
-flowi6_to_flowi(fl6), flags, );
-
-   if (arg.result)
-   return arg.result;

Re: [net-next v1] net: ipv6: avoid overhead when no custom FIB rules are installed

2017-08-08 Thread Vincent Bernat
 ❦  8 août 2017 08:46 -0600, David Ahern  :

>> diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
>> index 2f29e4e33bd3..693c27ede40e 100644
>> --- a/net/ipv6/fib6_rules.c
>> +++ b/net/ipv6/fib6_rules.c
>> @@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net)
>>  struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
>> int flags, pol_lookup_t lookup)
>>  {
>> -struct fib_lookup_arg arg = {
>> -.lookup_ptr = lookup,
>> -.flags = FIB_LOOKUP_NOREF,
>> -};
>> -
>>  /* update flow if oif or iif point to device enslaved to l3mdev */
>>  l3mdev_update_flow(net, flowi6_to_flowi(fl6));
>
> The l3mdev_update_flow can be moved to the has_custom_rules block.
> l3mdev requires FIB rules for the lookups to work, so no rules means no
> l3mdev configured.
>
> Rest looks good to me.

I suspected that it could be moved. I'll update the patch tomorrow.
-- 
Choose variable names that won't be confused.
- The Elements of Programming Style (Kernighan & Plauger)


[net-next v1] net: ipv6: avoid overhead when no custom FIB rules are installed

2017-08-08 Thread Vincent Bernat
If the user hasn't installed any custom rules, don't go through the
whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4:
Avoid overhead when no custom FIB rules are installed).

Using a micro-benchmark module [1], timing ip6_route_output() using
get_cycles(), on my laptop with performance governor (so, subject to
some caution due to thermal throttling), with 40,000 routes in the main
routing table, before this patch:

min=603 max=12477 count=627 average=1926 95th=4518 90th=3661 50th=1600 
mad=724
table=254 avgdepth=21.8 maxdepth=39
value │ ┊count
  600 │▒▒  134
  860 │▒▒▒░░91
 1120 │▒▒▒░░39
 1380 │░58
 1640 │░░   57
 1900 │░░   58
 2160 │▒▒▒░░░   46
 2420 │▒▒░░ 26
 2680 │▒▒   27
 2940 │▒░░  15
 3200 │  8
 3460 │░ 8
 3720 │░░   12
 3980 │░░9
 4240 │░░░   7
 4500 │░░░   6

After:

min=486 max=12390 count=627 average=1715 95th=4365 90th=3352 50th=1324 
mad=631
table=254 avgdepth=21.8 maxdepth=39
value │ ┊count
  480 │153
  730 │▒▒▒  88
  980 │░░░  57
 1230 │ 56
 1480 │▒▒▒  43
 1730 │▒▒▒  50
 1980 │▒▒   31
 2230 │▒░░  23
 2480 │▒15
 2730 │▒░   23
 2980 │▒░░░ 20
 3230 │▒14
 3480 │░░4
 3730 │░░8
 3980 │░░░   9
 4230 │░░░   4

At the frequency of the laptop during the bench (~ 2.3 GHz), this is
about a 130 ns difference on the median value and 50 ns on the minimal
value.

A next step would be to collapse local and main tables, as in
0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse).

[1]: 
https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 include/net/netns/ipv6.h |  1 +
 net/ipv6/fib6_rules.c| 34 --
 net/ipv6/route.c |  1 +
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index abdf3b40303b..0e50bf3ed097 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -65,6 +65,7 @@ struct netns_ipv6 {
unsigned int ip6_rt_gc_expire;
unsigned longip6_rt_last_gc;
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+   bool fib6_has_custom_rules;
struct rt6_info *ip6_prohibit_entry;
struct rt6_info *ip6_blk_hole_entry;
struct fib6_table   *fib6_local_tbl;
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 2f29e4e33bd3..693c27ede40e 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net)
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
   int flags, pol_lookup_t lookup)
 {
-   struct fib_lookup_arg arg = {
-   .lookup_ptr = lookup,
-   .flags = FIB_LOOKUP_NOREF,
-   };
-
/* update flow if oif or iif point to device enslaved to l3mdev */
l3mdev_update_fl

[PATCH net-next v1] ip6: fix PMTU discovery when using /127 subnets

2017-07-15 Thread Vincent Bernat
The definition of an "anycast destination address" has been tweaked as a
side-effect of commit 2647a9b07032 ("ipv6: Remove external dependency on
rt6i_gateway and RTF_ANYCAST"). The first address of a point-to-point
/127 subnet is now considered as an anycast address. This prevents
ICMPv6 errors to be returned to a sender of such a subnet and breaks
PMTU discovery.

This can be reproduced with:

ip link add name out6 type veth peer name in6
ip link add name out7 type veth peer name in7
ip link set mtu 1400 dev out7
ip link set mtu 1400 dev in7
ip netns add next-hop
ip netns add next-next-hop
ip link set netns next-hop dev in6
ip link set netns next-hop dev out7
ip link set netns next-next-hop dev in7
ip link set up dev out6
ip addr add 2001:db8:1::12/127 dev out6
ip netns exec next-hop ip link set up dev in6
ip netns exec next-hop ip link set up dev out7
ip netns exec next-hop ip addr add 2001:db8:1::13/127 dev in6
ip netns exec next-hop ip addr add 2001:db8:1::14/127 dev out7
ip netns exec next-hop ip route add default via 2001:db8:1::15
ip netns exec next-hop sysctl -qw net.ipv6.conf.all.forwarding=1
ip netns exec next-next-hop ip link set up dev in7
ip netns exec next-next-hop ip addr add 2001:db8:1::15/127 dev in7
ip netns exec next-next-hop ip addr add 2001:db8:1::50/128 dev in7
ip netns exec next-next-hop ip route add default via 2001:db8:1::14
ip netns exec next-next-hop sysctl -qw net.ipv6.conf.all.forwarding=1
ip route add 2001:db8:1::48/123 via 2001:db8:1::13
sleep 4
ping -M do -s 1452 -c 3 2001:db8:1::50 || true
ip route get 2001:db8:1::50

Before the patch, we get:

2001:db8:1::50 from :: via 2001:db8:1::13 dev out6 src 2001:db8:1::12 
metric 1024  pref medium

After the patch, we get:

2001:db8:1::50 via 2001:db8:1::13 dev out6 src 2001:db8:1::12 metric 0
cache  expires 578sec mtu 1400 pref medium

Fixes: 2647a9b07032 ("ipv6: Remove external dependency on rt6i_gateway and 
RTF_ANYCAST")
Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 include/net/ip6_route.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 199056933dcb..907d39a42f6b 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -194,7 +194,7 @@ static inline bool ipv6_anycast_destination(const struct 
dst_entry *dst,
struct rt6_info *rt = (struct rt6_info *)dst;
 
return rt->rt6i_flags & RTF_ANYCAST ||
-   (rt->rt6i_dst.plen != 128 &&
+   (rt->rt6i_dst.plen < 127 &&
 ipv6_addr_equal(>rt6i_dst.addr, daddr));
 }
 
-- 
2.13.2



[net-next] net: remove policy-routing.txt documentation

2017-06-27 Thread Vincent Bernat
It dates back from 2.1.16 and is obsolete since 2.1.68 when the current
rule system has been introduced.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 Documentation/networking/policy-routing.txt | 150 
 1 file changed, 150 deletions(-)
 delete mode 100644 Documentation/networking/policy-routing.txt

diff --git a/Documentation/networking/policy-routing.txt 
b/Documentation/networking/policy-routing.txt
deleted file mode 100644
index 36f6936d7f21..
--- a/Documentation/networking/policy-routing.txt
+++ /dev/null
@@ -1,150 +0,0 @@
-Classes

-
-   "Class" is a complete routing table in common sense.
-   I.e. it is tree of nodes (destination prefix, tos, metric)
-   with attached information: gateway, device etc.
-   This tree is looked up as specified in RFC1812 5.2.4.3
-   1. Basic match
-   2. Longest match
-   3. Weak TOS.
-   4. Metric. (should not be in kernel space, but they are)
-   5. Additional pruning rules. (not in kernel space).
-   
-   We have two special type of nodes:
-   REJECT - abort route lookup and return an error value.
-   THROW  - abort route lookup in this class.
-
-
-   Currently the number of classes is limited to 255
-   (0 is reserved for "not specified class")
-
-   Three classes are builtin:
-
-   RT_CLASS_LOCAL=255 - local interface addresses,
-   broadcasts, nat addresses.
-
-   RT_CLASS_MAIN=254  - all normal routes are put there
-   by default.
-
-   RT_CLASS_DEFAULT=253 - if ip_fib_model==1, then
-   normal default routes are put there, if ip_fib_model==2
-   all gateway routes are put there.
-
-
-Rules
--
-   Rule is a record of (src prefix, src interface, tos, dst prefix)
-   with attached information.
-
-   Rule types:
-   RTP_ROUTE - lookup in attached class
-   RTP_NAT   - lookup in attached class and if a match is found,
-   translate packet source address.
-   RTP_MASQUERADE - lookup in attached class and if a match is found,
-   masquerade packet as sourced by us.
-   RTP_DROP   - silently drop the packet.
-   RTP_REJECT - drop the packet and send ICMP NET UNREACHABLE.
-   RTP_PROHIBIT - drop the packet and send ICMP COMM. ADM. PROHIBITED.
-
-   Rule flags:
-   RTRF_LOG - log route creations.
-   RTRF_VALVE - One way route (used with masquerading)
-
-Default setup:
-
-root@amber:/pub/ip-routing # iproute -r
-Kernel routing policy rules
-Pref Source DestinationTOS Iface   Cl
-   0 defaultdefault00  *   255
- 254 defaultdefault00  *   254
- 255 defaultdefault00  *   253
-
-
-Lookup algorithm
-
-
-   We scan rules list, and if a rule is matched, apply it.
-   If a route is found, return it.
-   If it is not found or a THROW node was matched, continue
-   to scan rules.
-
-Applications
-
-
-1. Just ignore classes. All the routes are put into MAIN class
-   (and/or into DEFAULT class).
-
-   HOWTO:  iproute add PREFIX [ tos TOS ] [ gw GW ] [ dev DEV ]
-   [ metric METRIC ] [ reject ] ... (look at iproute utility)
-
-   or use route utility from current net-tools.
-   
-2. Opposite case. Just forget all that you know about routing
-   tables. Every rule is supplied with its own gateway, device
-   info. record. This approach is not appropriate for automated
-   route maintenance, but it is ideal for manual configuration.
-
-   HOWTO:  iproute addrule [ from PREFIX ] [ to PREFIX ] [ tos TOS ]
-   [ dev INPUTDEV] [ pref PREFERENCE ] route [ gw GATEWAY ]
-   [ dev OUTDEV ] .
-
-   Warning: As of now the size of the routing table in this
-   approach is limited to 256. If someone likes this model, I'll
-   relax this limitation.
-
-3. OSPF classes (see RFC1583, RFC1812 E.3.3)
-   Very clean, stable and robust algorithm for OSPF routing
-   domains. Unfortunately, it is not widely used in the Internet.
-
-   Proposed setup:
-   255 local addresses
-   254 interface routes
-   253 ASE routes with external metric
-   252 ASE routes with internal metric
-   251 inter-area routes
-   250 intra-area routes for 1st area
-   249 intra-area routes for 2nd area
-   etc.
-   
-   Rules:
-   iproute addrule class 253
-   iproute addrule class 252
-   iproute addrule class 251
-   iproute addrule to a-prefix-for-1st-area class 250
-   iproute addrule to another-prefix-for-1st-area class 250
-   ...
-   iproute addrule to a-prefix-for-2nd-area class 249
-   ...
-
-   Area classes must be terminated with reject record.
-   iproute add default reject class 250
-   iproute ad

[PATCH net-next v4] vxlan: fix ND proxy when skb doesn't have transport header offset

2017-04-02 Thread Vincent Bernat
When an incoming frame is tagged or when GRO is disabled, the skb
handled to vxlan_xmit() doesn't contain a valid transport header
offset. This makes ND proxying fail.

We combine two changes: replace use of skb_transport_offset() and ensure
the necessary amount of skb is linear just before using it:

 - In vxlan_xmit(), when determining if we have an ICMPv6 neighbor
   discovery packet, just check if it is an ICMPv6 packet and rely on
   neigh_reduce() to do more checks if this is the case. The use of
   pskb_may_pull() is replaced by skb_header_pointer() for just the IPv6
   header.

 - In neigh_reduce(), add pskb_may_pull() for IPv6 header and neighbor
   discovery message since this was removed from vxlan_xmit(). Replace
   skb_transport_header() with ipv6_hdr() + 1.

 - In vxlan_na_create(), replace first skb_transport_offset() with
   ipv6_hdr() + 1 and second with skb_network_offset() + sizeof(struct
   ipv6hdr). Additionally, ensure we pskb_may_pull() the whole skb as we
   need it to iterate over the options.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/vxlan.c | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1e54fb5c883a..89cb8267 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1515,7 +1515,7 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*request,
int ns_olen;
int i, len;
 
-   if (dev == NULL)
+   if (dev == NULL || !pskb_may_pull(request, request->len))
return NULL;
 
len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
@@ -1530,10 +1530,11 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*request,
skb_push(reply, sizeof(struct ethhdr));
skb_reset_mac_header(reply);
 
-   ns = (struct nd_msg *)skb_transport_header(request);
+   ns = (struct nd_msg *)(ipv6_hdr(request) + 1);
 
daddr = eth_hdr(request)->h_source;
-   ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
+   ns_olen = request->len - skb_network_offset(request) -
+   sizeof(struct ipv6hdr) - sizeof(*ns);
for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
@@ -1604,10 +1605,13 @@ static int neigh_reduce(struct net_device *dev, struct 
sk_buff *skb, __be32 vni)
if (!in6_dev)
goto out;
 
+   if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
+   goto out;
+
iphdr = ipv6_hdr(skb);
daddr = >daddr;
 
-   msg = (struct nd_msg *)skb_transport_header(skb);
+   msg = (struct nd_msg *)(iphdr + 1);
if (msg->icmph.icmp6_code != 0 ||
msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
goto out;
@@ -2242,16 +2246,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, 
struct net_device *dev)
if (ntohs(eth->h_proto) == ETH_P_ARP)
return arp_reduce(dev, skb, vni);
 #if IS_ENABLED(CONFIG_IPV6)
-   else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
-pskb_may_pull(skb, sizeof(struct ipv6hdr)
-  + sizeof(struct nd_msg)) &&
-ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
-   struct nd_msg *msg;
-
-   msg = (struct nd_msg 
*)skb_transport_header(skb);
-   if (msg->icmph.icmp6_code == 0 &&
-   msg->icmph.icmp6_type == 
NDISC_NEIGHBOUR_SOLICITATION)
-   return neigh_reduce(dev, skb, vni);
+   else if (ntohs(eth->h_proto) == ETH_P_IPV6) {
+   struct ipv6hdr *hdr, _hdr;
+   if ((hdr = skb_header_pointer(skb,
+ skb_network_offset(skb),
+ sizeof(_hdr), &_hdr)) &&
+   hdr->nexthdr == IPPROTO_ICMPV6)
+   return neigh_reduce(dev, skb, vni);
}
 #endif
}
-- 
2.11.0



[net-next v3] vxlan: fix ND proxy when skb doesn't have transport header offset

2017-03-31 Thread Vincent Bernat
When an incoming frame is tagged or when GRO is disabled, the skb
handled to vxlan_xmit() doesn't contain a valid transport header
offset. This makes ND proxying fail.

We combine two changes: replace use of skb_transport_offset() and ensure
the necessary amount of skb is linear just before using it:

 - In vxlan_xmit(), when determining if we have an ICMPv6 neighbor
   discovery packet, just check if it is an ICMPv6 packet and rely on
   neigh_reduce() to do more checks if this is the case. The use of
   pskb_may_pull() is replaced by skb_header_pointer() for just the IPv6
   header.

 - In neigh_reduce(), add pskb_may_pull() for IPv6 header and neighbor
   discovery message since this was removed from vxlan_xmit(). Replace
   skb_transport_header() with ipv6_hdr() + 1.

 - In vxlan_na_create(), replace first skb_transport_offset() with
   ipv6_hdr() + 1 and second with skb_network_offset() + sizeof(struct
   ipv6hdr). Additionally, ensure we pskb_may_pull() the whole skb as we
   need it to iterate over the options.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/vxlan.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1e54fb5c883a..54dda367de2b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1515,7 +1515,7 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*request,
int ns_olen;
int i, len;
 
-   if (dev == NULL)
+   if (dev == NULL || !pskb_may_pull(request, request->len))
return NULL;
 
len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
@@ -1530,10 +1530,11 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*request,
skb_push(reply, sizeof(struct ethhdr));
skb_reset_mac_header(reply);
 
-   ns = (struct nd_msg *)skb_transport_header(request);
+   ns = (struct nd_msg *)(ipv6_hdr(request) + 1);
 
daddr = eth_hdr(request)->h_source;
-   ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
+   ns_olen = request->len - skb_network_offset(request) -
+   sizeof(struct ipv6hdr) - sizeof(*ns);
for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
@@ -1604,10 +1605,13 @@ static int neigh_reduce(struct net_device *dev, struct 
sk_buff *skb, __be32 vni)
if (!in6_dev)
goto out;
 
+   if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)))
+   goto out;
+
iphdr = ipv6_hdr(skb);
daddr = >daddr;
 
-   msg = (struct nd_msg *)skb_transport_header(skb);
+   msg = (struct nd_msg *)(iphdr + 1);
if (msg->icmph.icmp6_code != 0 ||
msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
goto out;
@@ -2238,21 +2242,17 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, 
struct net_device *dev)
}
 
if (vxlan->flags & VXLAN_F_PROXY) {
+   struct ipv6hdr *hdr, _hdr;
eth = eth_hdr(skb);
if (ntohs(eth->h_proto) == ETH_P_ARP)
return arp_reduce(dev, skb, vni);
 #if IS_ENABLED(CONFIG_IPV6)
else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
-pskb_may_pull(skb, sizeof(struct ipv6hdr)
-  + sizeof(struct nd_msg)) &&
-ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
-   struct nd_msg *msg;
-
-   msg = (struct nd_msg 
*)skb_transport_header(skb);
-   if (msg->icmph.icmp6_code == 0 &&
-   msg->icmph.icmp6_type == 
NDISC_NEIGHBOUR_SOLICITATION)
-   return neigh_reduce(dev, skb, vni);
-   }
+(hdr = skb_header_pointer(skb,
+  skb_network_offset(skb),
+  sizeof(_hdr), &_hdr)) &&
+hdr->nexthdr == IPPROTO_ICMPV6)
+   return neigh_reduce(dev, skb, vni);
 #endif
}
 
-- 
2.11.0



Re: [net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset

2017-03-30 Thread Vincent Bernat
 ❦ 30 mars 2017 06:36 -0700, Eric Dumazet  :

>>> Parsing of neighbor discovery options is done earlier to ignore the
>>> whole packet in case of a malformed option. Moreover, the assumption the
>>> skb was linear is removed and options are extracted with
>>> skb_header_pointer() as well. The check on the source link-layer address
>>> option is also more strict (for Ethernet, we expect the length field to
>>> be 1).
>>
>> There is some parsing implemented in net/ipv6/ndisc.c, notably
>> ndisc_parse_options(). I don't know if this is a good idea to reuse
>> that: it may have the expectation that some IP processing has already
>> been done (for example, the IPv6 length has already been checked, the
>> SKB is expected to be linear).
>
> Forcing ICMP being linear is probably fine.
>
> Prior to 91269e390d062b52 ("vxlan: using pskb_may_pull as early as possible")
> this was happening (at the wrong place) in neigh_reduce() doing a :
> if (!pskb_may_pull(skb, skb->len))
>  goto out;

OK, I'll simplify the patch then.

> So the following would be ok (while incomplete of course)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 
> bdb6ae16d4a85bf9539199e189011bce104ba51a..cd032819d4a36d5ca94739f20f947f3f5a31aba3
> 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -2243,12 +2243,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff
> *skb, struct net_device *dev)
> return arp_reduce(dev, skb, vni);
>  #if IS_ENABLED(CONFIG_IPV6)
> else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
> -pskb_may_pull(skb, sizeof(struct ipv6hdr)
> -  + sizeof(struct nd_msg)) &&
> +skb->len >= sizeof(struct ipv6hdr) +
> +sizeof(struct nd_msg) &&
> +pskb_may_pull(skb, skb->len) &&
>  ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
> struct nd_msg *msg;
>
> -   msg = (struct nd_msg
> *)skb_transport_header(skb);
> +   msg = (struct nd_msg *)(ipv6_hdr(skb) + 1);
> if (msg->icmph.icmp6_code == 0 &&
> msg->icmph.icmp6_type ==
> NDISC_NEIGHBOUR_SOLICITATION)
> return neigh_reduce(dev, skb, vni);

pskb_may_pull() is called while we only know this is an IPv6 packet, not
an ICMPv6 one. I'll keep skb_header_pointer to handle IPv6 header, then
I'll pull the whole ICMP packet (unless I am mistaken, of course).
-- 
The devil can cite Scripture for his purpose.
-- William Shakespeare, "The Merchant of Venice"


Re: [net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset

2017-03-30 Thread Vincent Bernat
 ❦ 29 mars 2017 22:47 +0200, Vincent Bernat <vinc...@bernat.im> :

> Parsing of neighbor discovery options is done earlier to ignore the
> whole packet in case of a malformed option. Moreover, the assumption the
> skb was linear is removed and options are extracted with
> skb_header_pointer() as well. The check on the source link-layer address
> option is also more strict (for Ethernet, we expect the length field to
> be 1).

There is some parsing implemented in net/ipv6/ndisc.c, notably
ndisc_parse_options(). I don't know if this is a good idea to reuse
that: it may have the expectation that some IP processing has already
been done (for example, the IPv6 length has already been checked, the
SKB is expected to be linear).
-- 
Watch out for off-by-one errors.
- The Elements of Programming Style (Kernighan & Plauger)


[net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset

2017-03-29 Thread Vincent Bernat
When an incoming frame is tagged or when GRO is disabled, the skb
handled to vxlan_xmit() doesn't contain a valid transport header
offset. This makes ND proxying fail.

Do not rely on skb_transport_offset(). Instead, use offsets from
skb_network_offset() with skb_header_pointer() to extract appropriate
information to detect an ICMPv6 neighbor solicitation and to extract the
information needed to build the answer.

Duplicate checks at the beginning of neigh_reduce() are removed.

Parsing of neighbor discovery options is done earlier to ignore the
whole packet in case of a malformed option. Moreover, the assumption the
skb was linear is removed and options are extracted with
skb_header_pointer() as well. The check on the source link-layer address
option is also more strict (for Ethernet, we expect the length field to
be 1).

After this change, the vxlan module is correctly able to answer to
VLAN-encapsulated frames:

22:02:46.332573 50:54:33:00:00:02 > 33:33:00:00:00:02, ethertype 802.1Q 
(0x8100), length 74: vlan 100, p 0,ethertype IPv6, (hlim 255, next-header 
ICMPv6 (58) payload length: 16) fe80::5254:33ff:fe00:2 > ff02::2: [icmp6 sum 
ok] ICMP6, router solicitation, length 16
  source link-address option (1), length 8 (1): 50:54:33:00:00:02
0x:  5054 3300 0002
22:02:47.846631 50:54:33:00:00:08 > 33:33:00:00:00:02, ethertype 802.1Q 
(0x8100), length 74: vlan 100, p 0,ethertype IPv6, (hlim 255, next-header 
ICMPv6 (58) payload length: 16) fe80::5254:33ff:fe00:8 > ff02::2: [icmp6 sum 
ok] ICMP6, router solicitation, length 16
  source link-address option (1), length 8 (1): 50:54:33:00:00:08
0x:  5054 3300 0008

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/vxlan.c | 85 ++---
 1 file changed, 48 insertions(+), 37 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 1e54fb5c883a..f40272785aa2 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1504,20 +1504,43 @@ static int arp_reduce(struct net_device *dev, struct 
sk_buff *skb, __be32 vni)
 
 #if IS_ENABLED(CONFIG_IPV6)
 static struct sk_buff *vxlan_na_create(struct sk_buff *request,
-   struct neighbour *n, bool isrouter)
+  struct ipv6hdr *ip6h, struct nd_msg *ns,
+  struct neighbour *n, bool isrouter)
 {
struct net_device *dev = request->dev;
struct sk_buff *reply;
-   struct nd_msg *ns, *na;
+   struct nd_msg *na;
struct ipv6hdr *pip6;
-   u8 *daddr;
+   u8 *daddr, _daddr[ETH_ALEN];
int na_olen = 8; /* opt hdr + ETH_ALEN for target */
-   int ns_olen;
-   int i, len;
+   int len, remaining, offset;
 
if (dev == NULL)
return NULL;
 
+   /* Destination address is the "source link-layer address"
+* option if present and valid or the source Ethernet
+* address */
+   daddr = eth_hdr(request)->h_source;
+   remaining = htons(ip6h->payload_len) - sizeof(*ns);
+   offset = skb_network_offset(request) + sizeof(*ip6h) + sizeof(*ns);
+   while (remaining > sizeof(struct nd_opt_hdr)) {
+   struct nd_opt_hdr *ohdr, _ohdr;
+   ohdr = skb_header_pointer(request, offset, sizeof(_ohdr), 
&_ohdr);
+   if (!ohdr || !(len = ohdr->nd_opt_len<<3) || len > remaining)
+   return NULL;
+   if (ohdr->nd_opt_type == ND_OPT_SOURCE_LL_ADDR &&
+   len == na_olen) {
+   daddr = skb_header_pointer(request, offset + 
sizeof(_ohdr),
+  sizeof(_daddr), _daddr);
+   if (!daddr)
+   return NULL;
+   break;
+   }
+   remaining -= len;
+   offset += len;
+   }
+
len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
sizeof(*na) + na_olen + dev->needed_tailroom;
reply = alloc_skb(len, GFP_ATOMIC);
@@ -1530,16 +1553,6 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*request,
skb_push(reply, sizeof(struct ethhdr));
skb_reset_mac_header(reply);
 
-   ns = (struct nd_msg *)skb_transport_header(request);
-
-   daddr = eth_hdr(request)->h_source;
-   ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
-   for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
-   if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
-   daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
-   break;
-   }
-   }
 
/* Ethernet header */
ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
@@ -1556,10 +1569,10 @@ static struct sk_buff *vxlan_na_create(struct sk_buff 
*reque

[net-next v1] vxlan: use appropriate family on L3 miss

2017-03-10 Thread Vincent Bernat
When sending a L3 miss, the family is set to AF_INET even for IPv6. This
causes userland (eg "ip monitor") to be confused. Ensure we send the
appropriate family in this case. For L2 miss, keep using AF_INET.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/vxlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e375560cc74e..168257aa8ace 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -276,9 +276,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct 
vxlan_dev *vxlan,
send_eth = send_ip = true;
 
if (type == RTM_GETNEIGH) {
-   ndm->ndm_family = AF_INET;
send_ip = !vxlan_addr_any(>remote_ip);
send_eth = !is_zero_ether_addr(fdb->eth_addr);
+   ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : 
AF_INET;
} else
ndm->ndm_family = AF_BRIDGE;
ndm->ndm_state = fdb->state;
-- 
2.11.0



[PATCH] vxlan: use preferred address family when neither group or remote is specified

2017-03-09 Thread Vincent Bernat
When neither group or remote is specified (or if they are specified with
the any address), nothing is sent to the kernel. In this case, the
kernel defaults to IPv4. This makes impossible to use IPv6 with
unspecified unicast remote ("bridge fdb add" will return
EAFNOTSUPPORT).

If the user specifies a preferred address family (eg, "ip -6 link add"),
then send either IFLA_VXLAN_GROUP or IFLA_VXLAN_GROUP6 to enforce the
use of the appropriate family.

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 ip/iplink_vxlan.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 6d02bb47b2f0..fef7d3af4990 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -286,10 +286,14 @@ static int vxlan_parse_opt(struct link_util *lu, int 
argc, char **argv,
addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4);
else if (daddr)
addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4);
-   if (!IN6_IS_ADDR_UNSPECIFIED())
+   else if (!IN6_IS_ADDR_UNSPECIFIED())
addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct 
in6_addr));
else if (!IN6_IS_ADDR_UNSPECIFIED())
addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct 
in6_addr));
+   else if (preferred_family == AF_INET)
+   addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4);
+   else if (preferred_family == AF_INET6)
+   addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct 
in6_addr));
 
if (saddr)
addattr_l(n, 1024, IFLA_VXLAN_LOCAL, , 4);
-- 
2.11.0



Re: [PATCH] net: ethtool: don't require CAP_NET_ADMIN for ETHTOOL_GLINKSETTINGS

2016-12-24 Thread Vincent Bernat
 ❦ 24 novembre 2016 10:55 +0100, Miroslav Lichvar  :

> The ETHTOOL_GLINKSETTINGS command is deprecating the ETHTOOL_GSET
> command and likewise it shouldn't require the CAP_NET_ADMIN
> capability.

Could this patch be pushed to stable branches too?
-- 
Each module should do one thing well.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable

2016-09-19 Thread Vincent Bernat
 ❦ 19 septembre 2016 06:58 CEST, David Miller  :

>> @@ -1808,6 +1808,30 @@ static struct rt6_info *ip6_nh_lookup_table(struct 
>> net *net,
>>  return rt;
>>  }
>>  
>> +static int ip6_nh_valid(struct rt6_info *grt,
>> +struct net_device **dev, struct inet6_dev **idev) {
>> +int ret = 0;
>
> First, this is not formatted properly.  The openning brace should start
> on a new line.
>
> Second, please use "bool", "true", and "false" for the return value.

Noted for the next time. However, the v3 version of the patch doesn't
have the function anymore.
-- 
Avoid temporary variables.
- The Elements of Programming Style (Kernighan & Plauger)


[v3] net: ipv6: fallback to full lookup if table lookup is unsuitable

2016-09-18 Thread Vincent Bernat
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop
lookups") introduced a regression: insertion of an IPv6 route in a table
not containing the appropriate connected route for the gateway but which
contained a non-connected route (like a default gateway) fails while it
was previously working:

$ ip link add eth0 type dummy
$ ip link set up dev eth0
$ ip addr add 2001:db8::1/64 dev eth0
$ ip route add ::/0 via 2001:db8::5 dev eth0 table 20
$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
RTNETLINK answers: No route to host
$ ip -6 route show table 20
default via 2001:db8::5 dev eth0  metric 1024  pref medium

After this patch, we get:

$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
$ ip -6 route show table 20
2001:db8:cafe::1 via 2001:db8::6 dev eth0  metric 1024  pref medium
default via 2001:db8::5 dev eth0  metric 1024  pref medium

Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/ipv6/route.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..ec33c6d7eed5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1991,9 +1991,18 @@ static struct rt6_info *ip6_route_info_create(struct 
fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
 
-   if (cfg->fc_table)
+   if (cfg->fc_table) {
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
 
+   if (grt) {
+   if (grt->rt6i_flags & RTF_GATEWAY ||
+   (dev && dev != grt->dst.dev)) {
+   ip6_rt_put(grt);
+   grt = NULL;
+   }
+   }
+   }
+
if (!grt)
grt = rt6_lookup(net, gw_addr, NULL,
 cfg->fc_ifindex, 1);
-- 
2.9.3



[v2] net: ipv6: fallback to full lookup if table lookup is unsuitable

2016-09-16 Thread Vincent Bernat
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop
lookups") introduced a regression: insertion of an IPv6 route in a table
not containing the appropriate connected route for the gateway but which
contained a non-connected route (like a default gateway) fails while it
was previously working:

$ ip link add eth0 type dummy
$ ip link set up dev eth0
$ ip addr add 2001:db8::1/64 dev eth0
$ ip route add ::/0 via 2001:db8::5 dev eth0 table 20
$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
RTNETLINK answers: No route to host
$ ip -6 route show table 20
default via 2001:db8::5 dev eth0  metric 1024  pref medium

After this patch, we get:

$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
$ ip -6 route show table 20
2001:db8:cafe::1 via 2001:db8::6 dev eth0  metric 1024  pref medium
default via 2001:db8::5 dev eth0  metric 1024  pref medium

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/ipv6/route.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..2c6c7257ff75 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1994,6 +1994,14 @@ static struct rt6_info *ip6_route_info_create(struct 
fib6_config *cfg)
if (cfg->fc_table)
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
 
+   if (grt) {
+   if (grt->rt6i_flags & RTF_GATEWAY ||
+   (dev && dev != grt->dst.dev)) {
+   ip6_rt_put(grt);
+   grt = NULL;
+   }
+   }
+
if (!grt)
grt = rt6_lookup(net, gw_addr, NULL,
 cfg->fc_ifindex, 1);
-- 
2.9.3



Re: [PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable

2016-09-16 Thread Vincent Bernat
 ❦ 16 septembre 2016 20:36 CEST, David Ahern  :

>> contained a non-connected route (like a default gateway) fails while it
>> was previously working:
>> 
>> $ ip link add eth0 type dummy
>> $ ip link set up dev eth0
>> $ ip addr add 2001:db8::1/64 dev eth0
>> $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20
>> $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
>> RTNETLINK answers: No route to host
>> $ ip -6 route show table 20
>> default via 2001:db8::5 dev eth0  metric 1024  pref medium
>
> so your table 20 is not complete in that it lacks a connected route to
> resolve 2001:db8::6 as a nexthop, so you are relying on a fallback to
> other tables (main in this case).

Yes.

>> @@ -1991,33 +2015,15 @@ static struct rt6_info *ip6_route_info_create(struct 
>> fib6_config *cfg)
>>  if (!(gwa_type & IPV6_ADDR_UNICAST))
>>  goto out;
>>  
>> +err = -EHOSTUNREACH;
>>  if (cfg->fc_table)
>>  grt = ip6_nh_lookup_table(net, cfg, gw_addr);
>
> -8<-
>
>> -if (!(grt->rt6i_flags & RTF_GATEWAY))
>> -err = 0;
>
> This is the check that is failing for your use
> case. ip6_nh_lookup_table is returning the default route and nexthops
> can not rely on a gateway. Given that a simpler and more direct change
> is (whitespace mangled on paste):
>
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index ad4a7ff301fc..48bae2ee2e18 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -1991,9 +1991,19 @@ static struct rt6_info *ip6_route_info_create(struct 
> fib6_config *cfg)
> if (!(gwa_type & IPV6_ADDR_UNICAST))
> goto out;
>
> -   if (cfg->fc_table)
> +   if (cfg->fc_table) {
> grt = ip6_nh_lookup_table(net, cfg, gw_addr);
>
> +   /* a nexthop lookup can not go through a gw.
> +* if this happens on a table based lookup
> +* then fallback to a full lookup
> +*/
> +   if (grt && grt->rt6i_flags & RTF_GATEWAY) {
> +   ip6_rt_put(grt);
> +   grt = NULL;
> +   }
> +   }
> +
> if (!grt)
> grt = rt6_lookup(net, gw_addr, NULL,
>  cfg->fc_ifindex, 1);

OK. Should the dev check be dismissed or do we add "dev && dev !=
grt->dst.dev" just as a safety net (this would be a convulated setup,
but the correct direct route could be in an ip rule with higher priority
while the one in this table is incorrect)?
-- 
"... an experienced, industrious, ambitious, and often quite often
picturesque liar."
-- Mark Twain


[PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable

2016-09-16 Thread Vincent Bernat
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop
lookups") introduced a regression: insertion of an IPv6 route in a table
not containing the appropriate connected route for the gateway but which
contained a non-connected route (like a default gateway) fails while it
was previously working:

$ ip link add eth0 type dummy
$ ip link set up dev eth0
$ ip addr add 2001:db8::1/64 dev eth0
$ ip route add ::/0 via 2001:db8::5 dev eth0 table 20
$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
RTNETLINK answers: No route to host
$ ip -6 route show table 20
default via 2001:db8::5 dev eth0  metric 1024  pref medium

After this patch, we get:

$ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20
$ ip -6 route show table 20
2001:db8:cafe::1 via 2001:db8::6 dev eth0  metric 1024  pref medium
default via 2001:db8::5 dev eth0  metric 1024  pref medium

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 net/ipv6/route.c | 48 +++-
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..c2aaddcfed9e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1808,6 +1808,30 @@ static struct rt6_info *ip6_nh_lookup_table(struct net 
*net,
return rt;
 }
 
+static int ip6_nh_valid(struct rt6_info *grt,
+   struct net_device **dev, struct inet6_dev **idev) {
+   int ret = 0;
+
+   if (!grt)
+   goto out;
+   if (grt->rt6i_flags & RTF_GATEWAY)
+   goto out;
+   if (*dev) {
+   if (*dev != grt->dst.dev)
+   goto out;
+   } else {
+   *dev = grt->dst.dev;
+   *idev = grt->rt6i_idev;
+   dev_hold(*dev);
+   in6_dev_hold(*idev);
+   }
+   ret = 1;
+out:
+   if (grt)
+   ip6_rt_put(grt);
+   return ret;
+}
+
 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
 {
struct net *net = cfg->fc_nlinfo.nl_net;
@@ -1991,33 +2015,15 @@ static struct rt6_info *ip6_route_info_create(struct 
fib6_config *cfg)
if (!(gwa_type & IPV6_ADDR_UNICAST))
goto out;
 
+   err = -EHOSTUNREACH;
if (cfg->fc_table)
grt = ip6_nh_lookup_table(net, cfg, gw_addr);
-
-   if (!grt)
+   if (!ip6_nh_valid(grt, , )) {
grt = rt6_lookup(net, gw_addr, NULL,
 cfg->fc_ifindex, 1);
-
-   err = -EHOSTUNREACH;
-   if (!grt)
-   goto out;
-   if (dev) {
-   if (dev != grt->dst.dev) {
-   ip6_rt_put(grt);
+   if (!ip6_nh_valid(grt, , ))
goto out;
-   }
-   } else {
-   dev = grt->dst.dev;
-   idev = grt->rt6i_idev;
-   dev_hold(dev);
-   in6_dev_hold(grt->rt6i_idev);
}
-   if (!(grt->rt6i_flags & RTF_GATEWAY))
-   err = 0;
-   ip6_rt_put(grt);
-
-   if (err)
-   goto out;
}
err = -EINVAL;
if (!dev || (dev->flags & IFF_LOOPBACK))
-- 
2.9.3



Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-15 Thread Vincent Bernat
 ❦ 12 septembre 2016 18:12 CEST, Daniel Mack  :

> * The sample program learned to support both ingress and egress, and
>   can now optionally make the eBPF program drop packets by making it
>   return 0.

Ability to lock the eBPF program to avoid modification from a later
program or in a subcgroup would be pretty interesting from a security
perspective.
-- 
Use recursive procedures for recursively-defined data structures.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [net v1] fib_rules: interface group matching

2016-09-14 Thread Vincent Bernat
 ❦ 14 septembre 2016 17:25 CEST, David Ahern  :

>> I could just give more time to VRF. I also had some concerns over
>> performance with the way Netfilter integration is done, but I understand
>> that I could just stay away from POSTROUTING rules which is the only
>> hook executed twice?

> With the changes that were committed this past weekend, the VRF code
> is now setup where I can set a flag on a per VRF basis to disable the
> extra rx and tx processing - ie., no network taps, no netfilter, no
> qdisc, etc. Drops the overhead of VRF to ~3% maybe a bit less. I need
> to think about the user api a bit more and formalize the patch. Given
> my other commitments that probably won't happen until mid-October. But
> in terms of a building block, the overhead of VRF is continuing to
> drop.

Fine by me. We can drop my patch.

Thanks!
-- 
Program defensively.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [net v1] fib_rules: interface group matching

2016-09-14 Thread Vincent Bernat
 ❦ 14 septembre 2016 16:39 CEST, David Ahern  :

 When a user wants to assign a routing table to a group of incoming
 interfaces, the current solutions are:

  - one IP rule for each interface (scalability problems)
  - use of fwmark and devgroup matcher (don't work with internal route
lookups, used for example by RPF)
  - use of VRF devices (more complex)
>>>
>>> Why do you believe that? A VRF is a formalized grouping of interfaces
>>> that includes an API for locally generated traffic to specify which
>>> VRF/group to use. And, with the l3mdev rule you only need 1 rule for
>>> all VRFs regardless of the number which is the best solution to the
>>> scalability problem of adding rules per device/group/VRF.
>>>
>>> What use case are trying to solve?
>> 
>> Local processes have to be made aware of the VRF by binding to the
>> pseudo-device. Some processes may be tricked by LD_PRELOAD but some
>> won't (like stuff written in Go). Maybe I should just find a better way
>> to bind a process to a VRF without its cooperation.
>
> What API are you using for interface groups? How does an app tell the
> kernel to use interface group 1 versus group 2?

In my testbed, I have only one local application which is dnsmasq as a
DHCP server. It sends back the answer to the physical interface (with
sendmsg() and auxillary data). So it makes my argument a bit moot as the
situation is in fact worse without VRF. :-/

My testbed is here (with use of VRF, more recent commits just use plain
ip rules):

 
https://github.com/vincentbernat/network-lab/blob/d86e9ed658863ef0f51d7b853d0dc9f8b7427b21/lab-l3-hyperv/setup

I could just give more time to VRF. I also had some concerns over
performance with the way Netfilter integration is done, but I understand
that I could just stay away from POSTROUTING rules which is the only
hook executed twice?
-- 
All things that are, are with more spirit chased than enjoyed.
-- Shakespeare, "Merchant of Venice"


Re: [net v1] fib_rules: interface group matching

2016-09-14 Thread Vincent Bernat
 ❦ 14 septembre 2016 16:15 CEST, David Ahern  :

>> When a user wants to assign a routing table to a group of incoming
>> interfaces, the current solutions are:
>> 
>>  - one IP rule for each interface (scalability problems)
>>  - use of fwmark and devgroup matcher (don't work with internal route
>>lookups, used for example by RPF)
>>  - use of VRF devices (more complex)
>
> Why do you believe that? A VRF is a formalized grouping of interfaces
> that includes an API for locally generated traffic to specify which
> VRF/group to use. And, with the l3mdev rule you only need 1 rule for
> all VRFs regardless of the number which is the best solution to the
> scalability problem of adding rules per device/group/VRF.
>
> What use case are trying to solve?

Local processes have to be made aware of the VRF by binding to the
pseudo-device. Some processes may be tricked by LD_PRELOAD but some
won't (like stuff written in Go). Maybe I should just find a better way
to bind a process to a VRF without its cooperation.
-- 
Instrument your programs.  Measure before making "efficiency" changes.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [net v1] fib_rules: interface group matching

2016-09-14 Thread Vincent Bernat
 ❦ 14 septembre 2016 14:40 CEST, Vincent Bernat <vinc...@bernat.im> :

> Each interface can be assigned to a numeric group using IFLA_GROUP. This
> commit enables a user to reference such a group into an IP rule. Here is
> an example of output of iproute2:
>
> $ ip rule show
> 0:  from all lookup local
> 32764:  from all iifgroup 2 lookup 2
> 32765:  from all iifgroup 1 lookup 1
> 32766:  from all lookup main
> 32767:  from all lookup default

The patch for iproute2 is available here (didn't post it inline to avoid
confuse patchwork):
 http://paste.debian.net/821247/
-- 
Say what you mean, simply and directly.
- The Elements of Programming Style (Kernighan & Plauger)


[net v1] fib_rules: interface group matching

2016-09-14 Thread Vincent Bernat
When a user wants to assign a routing table to a group of incoming
interfaces, the current solutions are:

 - one IP rule for each interface (scalability problems)
 - use of fwmark and devgroup matcher (don't work with internal route
   lookups, used for example by RPF)
 - use of VRF devices (more complex)

Each interface can be assigned to a numeric group using IFLA_GROUP. This
commit enables a user to reference such a group into an IP rule. Here is
an example of output of iproute2:

$ ip rule show
0:  from all lookup local
32764:  from all iifgroup 2 lookup 2
32765:  from all iifgroup 1 lookup 1
32766:  from all lookup main
32767:  from all lookup default

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 include/net/fib_rules.h|  6 -
 include/uapi/linux/fib_rules.h |  2 ++
 net/core/fib_rules.c   | 57 +-
 3 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6006ab..a96b186ccd02 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -28,6 +28,8 @@ struct fib_rule {
u32 pref;
int suppress_ifgroup;
int suppress_prefixlen;
+   int iifgroup;
+   int oifgroup;
chariifname[IFNAMSIZ];
charoifname[IFNAMSIZ];
struct rcu_head rcu;
@@ -92,7 +94,9 @@ struct fib_rules_ops {
[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
[FRA_GOTO]  = { .type = NLA_U32 }, \
-   [FRA_L3MDEV]= { .type = NLA_U8 }
+   [FRA_L3MDEV]= { .type = NLA_U8 }, \
+   [FRA_IIFGROUP] = { .type = NLA_U32 }, \
+   [FRA_OIFGROUP] = { .type = NLA_U32 }
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 14404b3ebb89..0bf5a5e94d9a 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -51,6 +51,8 @@ enum {
FRA_OIFNAME,
FRA_PAD,
FRA_L3MDEV, /* iif or oif is l3mdev goto its table */
+   FRA_IIFGROUP,   /* interface group */
+   FRA_OIFGROUP,
__FRA_MAX
 };
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c344a6..f8ed6ba85c72 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -37,6 +37,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
 
+   r->iifgroup = -1;
+   r->oifgroup = -1;
+
/* The lock is not required here, the list in unreacheable
 * at the moment this function is called */
list_add_tail(>list, >rules_list);
@@ -193,6 +196,30 @@ static int fib_rule_match(struct fib_rule *rule, struct 
fib_rules_ops *ops,
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
 
+   if (rule->iifgroup != -1) {
+   struct net_device *dev;
+
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(rule->fr_net, fl->flowi_iif);
+   if (!dev || dev->group != rule->iifgroup) {
+   rcu_read_unlock();
+   goto out;
+   }
+   rcu_read_unlock();
+   }
+
+   if (rule->oifgroup != -1) {
+   struct net_device *dev;
+
+   rcu_read_lock();
+   dev = dev_get_by_index_rcu(rule->fr_net, fl->flowi_oif);
+   if (!dev || dev->group != rule->oifgroup) {
+   rcu_read_unlock();
+   goto out;
+   }
+   rcu_read_unlock();
+   }
+
ret = ops->match(rule, fl, flags);
 out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
@@ -305,6 +332,12 @@ static int rule_exists(struct fib_rules_ops *ops, struct 
fib_rule_hdr *frh,
if (r->l3mdev != rule->l3mdev)
continue;
 
+   if (r->iifgroup != rule->iifgroup)
+   continue;
+
+   if (r->oifgroup != rule->oifgroup)
+   continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -391,6 +424,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr 
*nlh)
goto errout_free;
}
 
+   if (tb[FRA_IIFGROUP])
+   rule->iifgroup = nla_get_u32(tb[FRA_IIFGROUP]);
+   else
+   rule->iifgroup = -1;
+
+   if (tb[FRA_OIFGROUP])
+   rule->oifgroup = nla_get_u32(tb[FRA_OIFGROUP]);
+   else
+   rule->oifgroup = -1;
+
rule-&

Re: [net v4] veth: advertise peer link once both links are tied together

2016-05-31 Thread Vincent Bernat
 ❦ 31 mai 2016 08:30 CEST, Vincent Bernat <vinc...@bernat.im> :

> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index f37a6e61d4ad..aaa1b023b9f2 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct 
> net_device *dev,
>  
>   priv = netdev_priv(peer);
>   rcu_assign_pointer(priv->peer, dev);
> +
> + rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U, GFP_KERNEL);
>   return 0;

Well, this is wrong here too because the NEWLINK message for dev was not
issued either (link state is not initialized), so we are back to square
one.

 3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
 link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
 link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state 
DOWN group default
 link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff

The netlink message for veth1 is sent by rtnl_newlink() after invocation
of veth_newlink(). So, I don't see any easy way to fix that. Maybe not
worth it.
-- 
Let the data structure the program.
- The Elements of Programming Style (Kernighan & Plauger)


[net v4] veth: advertise peer link once both links are tied together

2016-05-31 Thread Vincent Bernat
When the peer link is created, its "iflink" information is not
advertised through Netlink. Once created, the local device is advertised
with this information but if a user is maintaining a cache from all
updates, it will still miss the iflink for the peer link:

2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff
3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff

With this patch, we advertise again the peer link to let any user pick
the appropriate iflink information:

3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
v4:
 - use ~0U instead of IFF_SLAVE for ifi_change

v3:
 - send an additional netlink messages once the peer link is tied to
   avoid any chicken/egg problem

v2:
 - ensure the device is unregistered in case of link configuration failure

drivers/net/veth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..aaa1b023b9f2 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+
+   rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U, GFP_KERNEL);
return 0;
 
 err_register_dev:
-- 
2.8.1



Re: [net v3] veth: advertise peer link once both links are tied together

2016-05-31 Thread Vincent Bernat
 ❦ 30 mai 2016 18:27 CEST, Nicolas Dichtel  :

>>> +
>>> +   rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL);
>> 
>> Maybe ~0U would be better than hijacking IFF_SLAVE?
> IFF_SLAVE is wrong. It's a flag here, that will be put in the ifi_change field
> not an attribute number.

There are some use of IFF_SLAVE (in bonding/bond_main.c). But, I'll
update the patch nonetheless.
-- 
Use free-form input when possible.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [net v3] veth: advertise peer link once both links are tied together

2016-05-30 Thread Vincent Bernat
 ❦ 30 mai 2016 17:58 CEST, Vincent Bernat <vinc...@bernat.im> :

> +
> + rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL);

Maybe ~0U would be better than hijacking IFF_SLAVE?
-- 
Anyone who has had a bull by the tail knows five or six more things
than someone who hasn't.
-- Mark Twain


[net v3] veth: advertise peer link once both links are tied together

2016-05-30 Thread Vincent Bernat
When the peer link is created, its "iflink" information is not
advertised through Netlink. Once created, the local device is advertised
with this information but if a user is maintaining a cache from all
updates, it will still miss the iflink for the peer link:

2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff
3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff

With this patch, we advertise again the peer link to let any user pick
the appropriate iflink information:

3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff
4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
v3:
 - send an additional netlink messages once the peer link is tied to
   avoid any chicken/egg problem

v2:
 - ensure the device is unregistered in case of link configuration failure

 drivers/net/veth.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..2376d17b8f53 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+
+   rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL);
return 0;
 
 err_register_dev:
-- 
2.8.1



Re: [PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-30 Thread Vincent Bernat
 ❦ 30 mai 2016 17:19 CEST, Nicolas Dichtel  :

priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
 +
 +  err = rtnl_configure_link(peer, ifmp);
 +  if (err < 0)
 +  goto err_configure_peer;
>> 
>>> You should fix the error path. 'unregister_netdevice(dev)' is missing.
>> 
>> I am sending another patch to fix that. I am quite unsure if I do the
>> right thing here.
>> 
> A less intrusive fix is to call 'rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U,
> GFP_KERNEL);' a the end of veth_newlink().

I did that at first. Maybe this would make more sense to do
that. Otherwise, the first message contains an iflink value that we
cannot resolve with just the received netlink messages (since the
information is in the next netlink message). "ip monitor" seems to be
able to get the info, but I suppose it does an additional
lookup.
-- 
Make sure every module hides something.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-30 Thread Vincent Bernat
 ❦ 30 mai 2016 12:12 CEST, Vincent Bernat <vinc...@bernat.im> :

> When the peer link is created, its "iflink" information is not
[...]

And that's the wrong patch... Please, ignore this one.
-- 
Don't stop with your first draft.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-30 Thread Vincent Bernat
 ❦ 30 mai 2016 11:23 CEST, Nicolas Dichtel  :

>> @@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct 
>> net_device *dev,
>>  
>>  priv = netdev_priv(peer);
>>  rcu_assign_pointer(priv->peer, dev);
>> +
>> +err = rtnl_configure_link(peer, ifmp);
>> +if (err < 0)
>> +goto err_configure_peer;

> You should fix the error path. 'unregister_netdevice(dev)' is missing.

I am sending another patch to fix that. I am quite unsure if I do the
right thing here.
-- 
Don't stop with your first draft.
- The Elements of Programming Style (Kernighan & Plauger)


[PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-30 Thread Vincent Bernat
When the peer link is created, its "iflink" information is not
advertised through netlink. If a user is maintaining a cache from all
updates, it will miss this information:

2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff
3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff

To avoid this situation, the peer link is only configured after both
interfaces are tied together:

3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff
4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/veth.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..7580f6765948 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
netif_carrier_off(peer);
 
-   err = rtnl_configure_link(peer, ifmp);
-   if (err < 0)
-   goto err_configure_peer;
-
/*
 * register dev last
 *
@@ -466,8 +462,17 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+
+   err = rtnl_configure_link(peer, ifmp);
+   if (err < 0)
+   goto err_configure_link;
return 0;
 
+err_configure_link:
+   RCU_INIT_POINTER(priv->peer, NULL);
+   priv = netdev_priv(dev);
+   RCU_INIT_POINTER(priv->peer, NULL);
+   unregister_netdevice(dev);
 err_register_dev:
/* nothing to do */
 err_configure_peer:
-- 
2.8.1



[PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-30 Thread Vincent Bernat
When the peer link is created, its "iflink" information is not
advertised through netlink. If a user is maintaining a cache from all
updates, it will miss this information:

2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff
3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff

To avoid this situation, the peer link is only configured after both
interfaces are tied together:

3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff
4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/veth.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..9726c4dbf659 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
netif_carrier_off(peer);
 
-   err = rtnl_configure_link(peer, ifmp);
-   if (err < 0)
-   goto err_configure_peer;
-
/*
 * register dev last
 *
@@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+
+   err = rtnl_configure_link(peer, ifmp);
+   if (err < 0)
+   goto err_configure_peer;
return 0;
 
 err_register_dev:
-- 
2.8.1



[PATCH] veth: delay peer link configuration after interfaces are tied

2016-05-29 Thread Vincent Bernat
When the peer link is created, its "iflink" information is not
advertised through netlink. If a user is maintaining a cache from all
updates, it will miss this information:

2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff
3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff

To avoid this situation, the peer link is only configured after both
interfaces are tied together:

3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group 
default
link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff
4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN 
group default
link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff

Signed-off-by: Vincent Bernat <vinc...@bernat.im>
---
 drivers/net/veth.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f37a6e61d4ad..9726c4dbf659 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
netif_carrier_off(peer);
 
-   err = rtnl_configure_link(peer, ifmp);
-   if (err < 0)
-   goto err_configure_peer;
-
/*
 * register dev last
 *
@@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
+
+   err = rtnl_configure_link(peer, ifmp);
+   if (err < 0)
+   goto err_configure_peer;
return 0;
 
 err_register_dev:
-- 
2.8.1



Re: [PATCH v2] socket.7: Document some BPF-related socket options

2016-03-01 Thread Vincent Bernat
 ❦  1 mars 2016 11:03 +0100, "Michael Kerrisk (man-pages)" 
 :

>   Once   the   SO_LOCK_FILTER  option  has  been  enabled,
>   attempts by an unprivileged process to change or  remove
>   the  filter  attached  to  a  socket,  or to disable the
>   SO_LOCK_FILTER option will fail with the error EPERM.

You should remove "unprivileged". I didn't try to check for permissions
because I was just lazy (and I didn't have a need for it). As root, you
can just recreate another socket.
-- 
Choose a data representation that makes the program simple.
- The Elements of Programming Style (Kernighan & Plauger)


Re: [PATCH] veth: replace iflink by a dedicated symlink in sysfs

2015-08-22 Thread Vincent Bernat
 ❦ 20 août 2015 14:07 -0700, David Miller da...@davemloft.net :

 I also don't know what is the best way to handle this. veth advertises
 its peer via IFLA_LINK since 4.1, so it's too late to change it for
 this
 release.

 Apparently we need to pick our poison. Either way, we break something.
 Sure. I would prefer to have the same mechanism in all version, but I
 can live with the other solution.
 
 David, any thoughts about this?

 You can't break the 4.1 semantics, it's in a released kernel and people
 _ARE_ using it.

I had a look at what other kind of daemons may exploit the pre-4.1
semantics (of not having an infinite loop when following iflink) and
failed to find any other users than lldpd. Other LLDP daemons (lldpad,
ladvd, openlldpd) have other ways to find the lower interface. I would
also have thought that NetSNMP would use it to implement ifStackTable
but it doesn't in fact implement this table.
-- 
It were not best that we should all think alike; it is difference of opinion
that makes horse-races.
-- Mark Twain, Pudd'nhead Wilson's Calendar
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] veth: replace iflink by a dedicated symlink in sysfs

2015-08-19 Thread Vincent Bernat
 ❦ 19 août 2015 14:38 +0200, Jiri Benc jb...@redhat.com :

 That's the main goal of this patch: advertising the peer link as
 IFLA_LINK attribute triggers an infinite loop in userland software when
 they follow iflink to discover network devices topology. iflink has
 always been the index of a lower device. If a sysfs symbolic link is not
 good enough, I can propose a new IFLA_PEER attribute instead.

 This would cause regression and break applications for those of us who
 started relying on the netnsid feature to match interfaces across net
 name spaces.

Yes. Unfortunately.

 This is tough. If you're going to do such thing, you would at least
 need to also introduce IFLA_PEER_NETNSID.

Yes I can.

In my opinion, the change of semantics of IFLA_LINK is a break of
API. However, I can live with it since it's easy to workaround it. It
just seemed easier to start the discussion with a patch.
-- 
Parenthesise to avoid ambiguity.
- The Elements of Programming Style (Kernighan  Plauger)
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in 4.1 with veth and IFLA_LINK

2015-08-19 Thread Vincent Bernat
Here is a proposed patch to fix this by providing a symlink to the
peer instead. This may not totally replace the use of iflink since the
peer won't be available through netlink anymore but maybe this is good
enough for the intended use.

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] veth: replace iflink by a dedicated symlink in sysfs

2015-08-19 Thread Vincent Bernat
While the documentation doesn't say exactly what kind of relationship
iflink should represent, until a45253, only lower devices were
advertised this way. While veth cannot have a lower device, using iflink
to advertise the peer may create infinite loops in programs using iflink
to discover device topology.

Instead of advertising the peer link with iflink, a symbolic link peer
is added to each peer.

Signed-off-by: Vincent Bernat vinc...@bernat.im
---
 drivers/net/veth.c | 40 +---
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index c8186ffda1a3..47f165bc3107 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -105,6 +105,17 @@ static const struct ethtool_ops veth_ethtool_ops = {
.get_ethtool_stats  = veth_get_ethtool_stats,
 };
 
+static int veth_peer_sysfs_add(struct net_device *dev,
+ struct net_device *peer_dev)
+{
+   return sysfs_create_link((dev-dev.kobj), (peer_dev-dev.kobj),
+peer);
+}
+static void veth_peer_sysfs_del(struct net_device *dev)
+{
+   sysfs_remove_link((dev-dev.kobj), peer);
+}
+
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
struct veth_priv *priv = netdev_priv(dev);
@@ -263,20 +274,6 @@ static void veth_poll_controller(struct net_device *dev)
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
-static int veth_get_iflink(const struct net_device *dev)
-{
-   struct veth_priv *priv = netdev_priv(dev);
-   struct net_device *peer;
-   int iflink;
-
-   rcu_read_lock();
-   peer = rcu_dereference(priv-peer);
-   iflink = peer ? peer-ifindex : 0;
-   rcu_read_unlock();
-
-   return iflink;
-}
-
 static const struct net_device_ops veth_netdev_ops = {
.ndo_init= veth_dev_init,
.ndo_open= veth_open,
@@ -289,7 +286,6 @@ static const struct net_device_ops veth_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller= veth_poll_controller,
 #endif
-   .ndo_get_iflink = veth_get_iflink,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |\
@@ -436,6 +432,13 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
 
netif_carrier_off(dev);
 
+   err = veth_peer_sysfs_add(dev, peer);
+   if (err  0)
+   goto err_configure_dev;
+   err = veth_peer_sysfs_add(peer, dev);
+   if (err  0)
+   goto err_configure_dev;
+
/*
 * tie the deviced together
 */
@@ -447,9 +450,13 @@ static int veth_newlink(struct net *src_net, struct 
net_device *dev,
rcu_assign_pointer(priv-peer, dev);
return 0;
 
+err_configure_dev:
+   veth_peer_sysfs_del(dev);
+   veth_peer_sysfs_del(peer);
 err_register_dev:
/* nothing to do */
 err_configure_peer:
+   veth_peer_sysfs_del(dev);
unregister_netdevice(peer);
return err;
 
@@ -466,6 +473,9 @@ static void veth_dellink(struct net_device *dev, struct 
list_head *head)
priv = netdev_priv(dev);
peer = rtnl_dereference(priv-peer);
 
+   veth_peer_sysfs_del(dev);
+   if (peer) veth_peer_sysfs_del(dev);
+
/* Note : dellink() is called from default_device_exit_batch(),
 * before a rcu_synchronize() point. The devices are guaranteed
 * not being freed before one RCU grace period.
-- 
2.5.0

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html