Re: [next] bonding: pass link-local packets to bonding master also.
❦ 15 juillet 2018 19:12 -0700, Mahesh Bandewar : > Commit b89f04c61efe ("bonding: deliver link-local packets with > skb->dev set to link that packets arrived on") changed the behavior > of how link-local-multicast packets are processed. The change in > the behavior broke some legacy use cases where these packets are > expected to arrive on bonding master device also. Unfortunately, this doesn't completely restore the previous functionality as PACKET_ORIGDEV is broken for the copy: the original interface is lost through the call to netif_rx(). A LLDP daemon listening to the master interface won't get the original interface like it was able to before 4.12. I am a bit lost of what the original patch was trying to achieve. I am using the following test program: #v+ #!/usr/bin/env python3 import sys import socket import datetime socket.SOL_PACKET = 263 socket.ETH_P_ALL = 3 socket.PACKET_ORIGDEV = 9 interface = sys.argv[1] if len(sys.argv) > 1 else 'lag1' s = socket.socket(socket.AF_PACKET, socket.SOCK_RAW, socket.htons(socket.ETH_P_ALL)) s.bind((interface, 0)) s.setsockopt(socket.SOL_PACKET, socket.PACKET_ORIGDEV, 1) while True: data, addrinfo = s.recvfrom(1500) if addrinfo[2] == socket.PACKET_OUTGOING: continue print(f"{datetime.datetime.now().isoformat()}: " f"Received {len(data)} bytes from {addrinfo}") #v- If I run it with a kernel compiled with the commit before b89f04c61efe (plus a few more cherry-pick to make it work like ea8ffc0818d8 and 72ccc471e13b), I get: #v+ 2018-11-30T22:20:40.193378: Received 221 bytes from ('eth1', 35020, 2, 1, b'RT3\x00\x00\x02') 2018-11-30T22:20:40.194504: Received 221 bytes from ('eth0', 35020, 2, 1, b'RT3\x00\x00\x01') #v- If I send non link-local packets, I get: #v+ 2018-11-30T22:25:57.300965: Received 98 bytes from ('eth0', 2048, 0, 1, b'PT3\x00\x00\x02') #v- I am also able to correctly receive link-local packets directly on each interface. So, it seems everything was working as expected before b89f04c61efe. -- AWAKE! FEAR! FIRE! FOES! AWAKE! FEAR! FIRE! FOES! AWAKE! AWAKE! -- J. R. R. Tolkien
[PATCH net-next v1] net: don't declare IPv6 non-local bind helper if CONFIG_IPV6 undefined
Fixes: 83ba4645152d ("net: add helpers checking if socket can be bound to nonlocal address") Signed-off-by: Vincent Bernat --- include/net/ipv6.h | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 82deb684ba73..ff33f498c137 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,13 +766,6 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } -static inline bool ipv6_can_nonlocal_bind(struct net *net, - struct inet_sock *inet) -{ - return net->ipv6.sysctl.ip_nonlocal_bind || - inet->freebind || inet->transparent; -} - /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; @@ -789,6 +782,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 -- 2.18.0
[PATCH net-next v1] net: add helpers checking if socket can be bound to nonlocal address
The construction "net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent" is present three times and its IPv6 counterpart is also present three times. We introduce two small helpers to characterize these tests uniformly. Signed-off-by: Vincent Bernat --- include/net/inet_sock.h | 8 include/net/ipv6.h | 7 +++ net/ipv4/af_inet.c | 3 +-- net/ipv4/ping.c | 6 ++ net/ipv6/af_inet6.c | 6 ++ net/ipv6/datagram.c | 3 +-- 6 files changed, 21 insertions(+), 12 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 314be484c696..e03b93360f33 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -359,4 +359,12 @@ static inline bool inet_get_convert_csum(struct sock *sk) return !!inet_sk(sk)->convert_csum; } + +static inline bool inet_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv4.sysctl_ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + #endif /* _INET_SOCK_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a44509f4e985..82deb684ba73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -766,6 +766,13 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +static inline bool ipv6_can_nonlocal_bind(struct net *net, + struct inet_sock *inet) +{ + return net->ipv6.sysctl.ip_nonlocal_bind || + inet->freebind || inet->transparent; +} + /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2a0a3bab6b5..ee707b91d1a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -486,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b54c964ad925..8d7aaf118a30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && - isk->freebind == 0 && isk->transparent == 0 && + if ((!inet_can_nonlocal_bind(net, isk) && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) @@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(net->ipv6.sysctl.ip_nonlocal_bind || - isk->freebind || isk->transparent || has_addr || + if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c9535354149f..020f6e14a7af 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!net->ipv4.sysctl_ip_nonlocal_bind && - !(inet->freebind || inet->transparent) && + if (!inet_can_nonlocal_bind(net, inet) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && @@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!net->ipv6.sysctl.ip_nonlocal_bind && - !(inet->freebind || inet->transparent)
Re: [net-next v1] net/ipv6: allow any source address for sendmsg pktinfo with ip_nonlocal_bind
❦ 29 juillet 2018 12:28 -0700, David Miller : >> When freebind feature is set of an IPv6 socket, any source address can >> be used when sending UDP datagrams using IPv6 PKTINFO ancillary >> message. Global non-local bind feature was added in commit >> 35a256fee52c ("ipv6: Nonlocal bind") for IPv6. This commit also allows >> IPv6 source address spoofing when non-local bind feature is enabled. >> >> Signed-off-by: Vincent Bernat > > This definitely seems to make sense. And is consistent with the other > tests involving freebind and transparent. > > This test involving ip_nonlocal_bind, freeebind, and transparent happens > in several locations. Perhaps we should add a helper function for > this? Yes, I can do that. Should I also include one for SCTP? -- "Elves and Dragons!" I says to him. "Cabbages and potatoes are better for you and me." -- J. R. R. Tolkien
[net-next v1] net/ipv6: allow any source address for sendmsg pktinfo with ip_nonlocal_bind
When freebind feature is set of an IPv6 socket, any source address can be used when sending UDP datagrams using IPv6 PKTINFO ancillary message. Global non-local bind feature was added in commit 35a256fee52c ("ipv6: Nonlocal bind") for IPv6. This commit also allows IPv6 source address spoofing when non-local bind feature is enabled. Signed-off-by: Vincent Bernat --- net/ipv6/datagram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 201306b9b5ea..c46936563b15 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -800,7 +800,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + inet_sk(sk)->freebind || inet_sk(sk)->transparent) && !ipv6_chk_addr_and_flags(net, _info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && -- 2.18.0
Re: [PATCH iproute2-next] ipaddress: fix label matching
❦ 14 juillet 2018 21:54 +0300, Serhey Popovych : > We should leave only filter.label check and return 0: > > if (filter.label) > return 0; > > This will ensure we exit from print_linkinfo() earlier, skip > print_link_stats() and push final filtering by label to > print_selected_addrinfo() and print_addrinfo(). > > And finally: this is regression and should be against iproute2, not -next. As you did the analysis, I let you do the patch. -- Watch out for off-by-one errors. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH iproute2-next] ipaddress: fix label matching
❦ 11 juillet 2018 21:01 -0400, David Ahern : >> +++ b/ip/ipaddress.c >> @@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who, >> if (!name) >> return -1; >> >> -if (filter.label && >> -(!filter.family || filter.family == AF_PACKET) && >> -fnmatch(filter.label, name, 0)) >> -return -1; >> - > > The offending commit changed the return code: > > if (filter.label && > (!filter.family || filter.family == AF_PACKET) && > - fnmatch(filter.label, RTA_DATA(tb[IFLA_IFNAME]), 0)) > - return 0; > + fnmatch(filter.label, name, 0)) > + return -1; > > > Vincent: can you try leaving the code as is, but change the return to 0? Yes, it works by just returning 0. The code still doesn't make sense. -- Many pages make a thick book, except for pocket Bibles which are on very very thin paper.
Re: [PATCH iproute2-next] ipaddress: fix label matching
❦ 11 juillet 2018 13:03 -0700, Stephen Hemminger : >> Since 9516823051ce, "ip addr show label lo:1" doesn't work >> anymore (doesn't show any address, despite a matching label). >> Reverting to return 0 instead of -1 fix the issue. >> >> However, the condition says: "if we filter by label [...] and the >> label does NOT match the interface name". This makes little sense to >> compare the label with the interface name. There is also a logic >> around filter family being provided or not. The match against the >> label is done by ifa_label_match_rta() in print_addrinfo() and >> ipaddr_filter(). >> >> Just removing the condition makes "ip addr show" works as expected >> with or without specifying a label, both when the label is matching >> and not matching. It also works if we specify a label and the label is >> the interface name. The flush operation also works as expected. >> >> Fixes: 9516823051ce ("ipaddress: Improve print_linkinfo()") >> Signed-off-by: Vincent Bernat >> --- >> ip/ipaddress.c | 5 - >> 1 file changed, 5 deletions(-) >> >> diff --git a/ip/ipaddress.c b/ip/ipaddress.c >> index 5009bfe6d2e3..20ef6724944e 100644 >> --- a/ip/ipaddress.c >> +++ b/ip/ipaddress.c >> @@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who, >> if (!name) >> return -1; >> >> -if (filter.label && >> -(!filter.family || filter.family == AF_PACKET) && >> -fnmatch(filter.label, name, 0)) >> -return -1; >> - >> if (tb[IFLA_GROUP]) { >> int group = rta_getattr_u32(tb[IFLA_GROUP]); >> > > If this is a regression, it should go to iproute2 not iproute2-next. > > Surprised by the solution since it is removing code that was there > before the commit you referenced in Fixes. Yes, but as I explain in the commit message, the condition does not make sense for me: why would we match the label against the interface name? This code exists since a long time. -- The lunatic, the lover, and the poet, Are of imagination all compact... -- Wm. Shakespeare, "A Midsummer Night's Dream"
[PATCH iproute2-next] ipaddress: fix label matching
Since 9516823051ce, "ip addr show label lo:1" doesn't work anymore (doesn't show any address, despite a matching label). Reverting to return 0 instead of -1 fix the issue. However, the condition says: "if we filter by label [...] and the label does NOT match the interface name". This makes little sense to compare the label with the interface name. There is also a logic around filter family being provided or not. The match against the label is done by ifa_label_match_rta() in print_addrinfo() and ipaddr_filter(). Just removing the condition makes "ip addr show" works as expected with or without specifying a label, both when the label is matching and not matching. It also works if we specify a label and the label is the interface name. The flush operation also works as expected. Fixes: 9516823051ce ("ipaddress: Improve print_linkinfo()") Signed-off-by: Vincent Bernat --- ip/ipaddress.c | 5 - 1 file changed, 5 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index 5009bfe6d2e3..20ef6724944e 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -837,11 +837,6 @@ int print_linkinfo(const struct sockaddr_nl *who, if (!name) return -1; - if (filter.label && - (!filter.family || filter.family == AF_PACKET) && - fnmatch(filter.label, name, 0)) - return -1; - if (tb[IFLA_GROUP]) { int group = rta_getattr_u32(tb[IFLA_GROUP]); -- 2.18.0
[PATCH net-next v1] netfilter: provide input interface for route lookup for rpfilter
In commit 47b7e7f82802, this bit was removed at the same time the RT6_LOOKUP_F_IFACE flag was removed. However, it is needed when link-local addresses are used, which is a very common case: when packets are routed, neighbor solicitations are done using link-local addresses. For example, the following neighbor solicitation is not matched by "-m rpfilter": IP6 fe80::5254:33ff:fe00:1 > ff02::1:ff00:3: ICMP6, neighbor solicitation, who has 2001:db8::5254:33ff:fe00:3, length 32 Commit 47b7e7f82802 doesn't quite explain why we shouldn't use RT6_LOOKUP_F_IFACE in the rpfilter case. I suppose the interface check later in the function would make it redundant. However, the remaining of the routing code is using RT6_LOOKUP_F_IFACE when there is no source address (which matches rpfilter's case with a non-unicast destination, like with neighbor solicitation). Signed-off-by: Vincent Bernat <vinc...@bernat.im> Fixes: 47b7e7f82802 ("netfilter: don't set F_IFACE on ipv6 fib lookups") --- net/ipv6/netfilter/ip6t_rpfilter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d12f511929f5..0fe61ede77c6 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -48,6 +48,8 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, } fl6.flowi6_mark = flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; + if ((flags & XT_RPFILTER_LOOSE) == 0) + fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, , skb, lookup_flags); if (rt->dst.error) -- 2.17.0
Re: [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup
❦ 29 avril 2018 11:07 -0700, David Ahern: > +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, > + int flags) Maybe an EXPORT_SYMBOL_GPL? There is one for __fib_lookup (fib_lookup is an inline function). -- Use the "telephone test" for readability. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH net-next v1] ipvs: add consistent source hashing scheduling
❦ 2 avril 2018 22:05 +0300, Julian Anastasov: > Sorry to say it but may be you missed the discussion > on lvs-devel about the new MH scheduler implemented by Inju Song: > > https://www.spinics.net/lists/lvs-devel/msg04928.html > http://archive.linuxvirtualserver.org/html/lvs-devel/2018-03/msg00023.html > > In the last 6 months we fixed all issues and I acked > v4 just yesterday: > > http://archive.linuxvirtualserver.org/html/lvs-devel/2018-04/msg3.html For some reason, "ipvs maglev" in Google doesn't yield those results. But having code already reviewed is great news for me: I can use it with confidence. :) -- Make sure your code "does nothing" gracefully. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH net-next v1] ipvs: add consistent source hashing scheduling
❦ 2 avril 2018 10:33 -0700, Eric Dumazet: >> +static inline u32 >> +ip_vs_csh_permutation(struct ip_vs_dest *d, int j) >> +{ >> +u32 offset, skip; >> +__be32 addr_fold = d->addr.ip; >> + >> +#ifdef CONFIG_IP_VS_IPV6 >> +if (d->af == AF_INET6) >> +addr_fold = d->addr.ip6[0]^d->addr.ip6[1]^ >> +d->addr.ip6[2]^d->addr.ip6[3]; >> +#endif >> +addr_fold = ntohl(addr_fold) + ntohs(d->port); >> +offset = hash_32(addr_fold, 32) % IP_VS_CSH_TAB_SIZE; >> +skip = (hash_32(addr_fold + 1, 32) % (IP_VS_CSH_TAB_SIZE - 1)) + 1; >> +return (offset + j * skip) % IP_VS_CSH_TAB_SIZE; >> +} >> + > > This does not look very strong to me, particularly the IPv6 folding > > I would rather use __ipv6_addr_jhash() instead of ipv6_addr_hash(), > even if it is hard coded ;) I can switch to ipv6_addr_hash(). However, switching to __ipv6_addr_jhash seems useless as I would need to hardcode the initial value: people use source hashing to get the same result from one host to another. Am I missing something? -- Each module should do one thing well. - The Elements of Programming Style (Kernighan & Plauger)
[PATCH net-next v1] ipvs: add consistent source hashing scheduling
Based on Google's Maglev algorithm [1][2], this scheduler builds a lookup table in a way disruption is minimized when a change occurs. This helps in case of active/active setup without synchronization. Like for classic source hashing, this lookup table is used to assign connections to a real server. Both source address and port are used to compute the hash (unlike sh where this is optional). Weights are correctly handled. Unlike sh, servers with a weight of 0 are considered as absent. Also, unlike sh, when a server becomes unavailable due to a threshold, no fallback is possible: doing so would seriously impair the the usefulness of using a consistent hash. There is a small hack to detect when all real servers have a weight of 0. It relies on the fact it is not possible for the weight of a real server to change during the execution of the assignment. I believe this is the case as modifications through netlink are subject to a mutex, but the use of atomic_read() is unsettling. The value of 65537 for the hash table size is currently not modifiable at compile-time. This is the value suggested in the Maglev paper. Another possible value is 257 (for small tests) and 655373 (for very large setups). [1]: https://research.google.com/pubs/pub44824.html [2]: https://blog.acolyer.org/2016/03/21/maglev-a-fast-and-reliable-software-network-load-balancer/ Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- include/net/ip_vs.h| 27 net/netfilter/ipvs/Kconfig | 13 ++ net/netfilter/ipvs/Makefile| 1 + net/netfilter/ipvs/ip_vs_csh.c | 339 + net/netfilter/ipvs/ip_vs_sh.c | 32 +--- 5 files changed, 381 insertions(+), 31 deletions(-) create mode 100644 net/netfilter/ipvs/ip_vs_csh.c diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index eb0bec043c96..2184b43b7320 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -195,6 +195,33 @@ static inline int ip_vs_addr_equal(int af, const union nf_inet_addr *a, return a->ip == b->ip; } +static inline __be16 ip_vs_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. +* Because ICMP packets are only guaranteed to have the first 8 +* bytes, let's just grab the ports. Fortunately they're in the +* same position for all three of the protocols we care about. +*/ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + #ifdef CONFIG_IP_VS_DEBUG #include diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index b32fb0dbe237..bfd091e020af 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -225,6 +225,19 @@ config IP_VS_SH If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_CSH + tristate "consistent source hashing scheduling" + ---help--- + The consistent source hashing scheduling algorithm assigns + network connections to the servers through looking up a + statically assigned hash table by their source IP addresses + and ports. The hash table is built to minimize disruption + when a server becomes unavailable. It relies on the Maglev + algorithm. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_SED tristate "shortest expected delay scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index c552993fa4b9..7d0badf86dfe 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_CSH) += ip_vs_csh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o diff --git a/net/netfilter/ipvs/ip_vs_csh.c b/net/netfilter/ipvs/ip_vs_csh.c new file mode 100644 index ..c70db61ba934 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_csh.c @@ -0,0 +1,339 @@ +/* + * IPVS:Consistent Hashing scheduling module using Google's Maglev + * + * Authors: Vincent Bernat <vinc...@bernat.im> + * + * This program is free software; you can redistr
[PATCH net-next v2] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms
The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative hashing incorrectly. Replace its use by the hash_32() macro, which correctly implements this algorithm. It doesn't use the same constant, but it shouldn't matter. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/netfilter/ipvs/ip_vs_dh.c| 3 ++- net/netfilter/ipvs/ip_vs_lblc.c | 3 ++- net/netfilter/ipvs/ip_vs_lblcr.c | 3 ++- net/netfilter/ipvs/ip_vs_sh.c| 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f798f8e83b..dfea31fb10c5 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3057e453bf31..08147fc6400c 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -48,6 +48,7 @@ #include #include #include +#include /* for sysctl */ #include @@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 92adc04557ed..9b6a6c9e9cfa 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -47,6 +47,7 @@ #include #include #include +#include /* for sysctl */ #include @@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 16aaac6eedc9..1e01c782583a 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + return (offset + hash_32(ntohs(port) + ntohl(addr_fold), +IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } -- 2.16.3
Re: [PATCH net-next v1] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms
❦ 1 avril 2018 11:11 +0300, Julian Anastasov: >> -return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; >> +return ((ntohl(addr_fold)*2654435761U) >> >> +(32 - IP_VS_DH_TAB_BITS)) & >> +IP_VS_DH_TAB_MASK; > > Looks like the '& mask' part is not needed, still, > it does not generate extra code. I see that other code uses > hash_32(val, bits) from include/linux/hash.h but note that it > used different ratio before Linux 4.7, in case someone backports > this patch on old kernels. So, I don't have preference what should > be used, may be return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS) > is better. I didn't notice this macro. I think this is a better option. Let me amend the patch. -- Don't stop with your first draft. - The Elements of Programming Style (Kernighan & Plauger)
[PATCH net-next v1] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms
The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative hashing incorrectly. This results in uneven distribution. To fix this, the result has to be shifted by a constant. In "Lecture 21: Hash functions" [1], it is said: In the fixed-point version, The division by 2^q is crucial. The common mistake when doing multiplicative hashing is to forget to do it, and in fact you can find web pages highly ranked by Google that explain multiplicative hashing without this step. Without this division, there is little point to multiplying by a, because ka mod m = (k mod m) * (a mod m) mod m . This is no better than modular hashing with a modulus of m, and quite possibly worse. Typing the 2654435761 constant in DuckDuckGo shows many other sources to confirm this issue. Moreover, doing the multiplication in the 32bit integer space is enough, hence the change from 2654435761UL to 2654435761U. [1]: https://www.cs.cornell.edu/courses/cs3110/2008fa/lectures/lec21.html The following Python program illustrates the bug and its fix: import netaddr import collections import socket import statistics def run(buggy=False): base = netaddr.IPAddress('203.0.113.0') count = collections.defaultdict(int) for offset in range(100): for port in range(1, 11000): r = socket.ntohs(port) + socket.ntohl(int(base) + offset) r *= 2654435761 if buggy: r %= 1 << 64 else: r %= 1 << 32 r >>= 24 r &= 255 count[r] += 1 print(buggy, statistics.mean(count.values()), statistics.stdev(count.values())) run(True) run(False) Its output is: True 25000 765.9416862050705 False 390.625 4.681209831891333 Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/netfilter/ipvs/ip_vs_dh.c| 4 +++- net/netfilter/ipvs/ip_vs_lblc.c | 4 +++- net/netfilter/ipvs/ip_vs_lblcr.c | 4 +++- net/netfilter/ipvs/ip_vs_sh.c| 3 ++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f798f8e83b..5638e66dbdd1 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -81,7 +81,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; + return ((ntohl(addr_fold)*2654435761U) >> + (32 - IP_VS_DH_TAB_BITS)) & + IP_VS_DH_TAB_MASK; } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3057e453bf31..df32022a2bc4 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -160,7 +160,9 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; + return ((ntohl(addr_fold)*2654435761U) >> + (32 - IP_VS_LBLC_TAB_BITS)) & + IP_VS_LBLC_TAB_MASK; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 92adc04557ed..3d0d278d4901 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -323,7 +323,9 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; + return ((ntohl(addr_fold)*2654435761U) >> + (32 - IP_VS_LBLCR_TAB_BITS)) & + IP_VS_LBLCR_TAB_MASK; } diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 16aaac6eedc9..d2d6cdfae86e 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + return ((offset + (ntohs(port) + ntohl(addr_fold))*2654435761U) >> + (32 - IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } -- 2.16.3
Re: [PATCH iproute2-next] color: disable color when json output is requested
❦ 20 février 2018 16:04 -0800, Stephen Hemminger <step...@networkplumber.org> : >> Instead of declaring -color and -json exclusive, ignore -color when >> -json is provided. The rationale is to allow to put -color in an alias >> for ip while still being able to use -json. -color is merely a >> presentation suggestion and we can assume there is nothing to color in >> the JSON output. >> >> Signed-off-by: Vincent Bernat <vinc...@bernat.im> > > Looks fine to me, this could even go into master. > Need to update man page and make sure behavior is consistent > across ip, tc, and bridge commands. Currently, in master or net-next, only the "ip" command has a -color option. -- The human race is a race of cowards; and I am not only marching in that procession but carrying a banner. -- Mark Twain
[PATCH iproute2-next] color: disable color when json output is requested
Instead of declaring -color and -json exclusive, ignore -color when -json is provided. The rationale is to allow to put -color in an alias for ip while still being able to use -json. -color is merely a presentation suggestion and we can assume there is nothing to color in the JSON output. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- include/color.h | 1 - ip/ip.c | 7 --- lib/color.c | 8 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/include/color.h b/include/color.h index f6c351b77746..c80359d3e2e9 100644 --- a/include/color.h +++ b/include/color.h @@ -13,7 +13,6 @@ enum color_attr { }; void enable_color(void); -void check_if_color_enabled(void); void set_color_palette(void); int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...); enum color_attr ifa_family_color(__u8 ifa_family); diff --git a/ip/ip.c b/ip/ip.c index b15e6b66b3f6..ebf77aa2ee37 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -172,6 +172,7 @@ int main(int argc, char **argv) { char *basename; char *batch_file = NULL; + int color = 0; basename = strrchr(argv[0], '/'); if (basename == NULL) @@ -273,7 +274,7 @@ int main(int argc, char **argv) } rcvbuf = size; } else if (matches(opt, "-color") == 0) { - enable_color(); + ++color; } else if (matches(opt, "-help") == 0) { usage(); } else if (matches(opt, "-netns") == 0) { @@ -293,8 +294,8 @@ int main(int argc, char **argv) _SL_ = oneline ? "\\" : "\n"; - if (json) - check_if_color_enabled(); + if (color && !json) + enable_color(); if (batch_file) return batch(batch_file); diff --git a/lib/color.c b/lib/color.c index a13a4930b10c..da1f516cb249 100644 --- a/lib/color.c +++ b/lib/color.c @@ -92,14 +92,6 @@ void set_color_palette(void) is_dark_bg = 1; } -void check_if_color_enabled(void) -{ - if (color_is_enabled) { - fprintf(stderr, "Option \"-json\" conflicts with \"-color\".\n"); - exit(1); - } -} - int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...) { int ret = 0; -- 2.16.1
Re: [PATCH] net: bridge: add max_fdb_count
❦ 16 novembre 2017 20:23 +0100, Andrew Lunn: > struct net_bridge_fdb_entry is 40 bytes. > > My WiFi access point which is also a 5 port bridge, currently has 97MB > free RAM. That is space for about 2.5M FDB entries. So even Roopa's > 128K is not really a problem, in terms of memory. I am also interested in Sarah's patch because we can now have bridge with many ports through VXLAN. The FDB can be replicated to an external daemon with BGP and the cost of each additional MAC address is therefore higher than just a few bytes. It seems simpler to implement a limiting policy early (at the port or bridge level). Also, this is a pretty standard limit to have for a bridge (switchport port-security maximum on Cisco, set interface X mac-limit on Juniper). And it's not something easy to do with ebtables. -- Use the good features of a language; avoid the bad ones. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH net] vxlan: fix the issue that neigh proxy blocks all icmpv6 packets
❦ 11 novembre 2017 19:58 +0800, Xin Long <lucien@gmail.com> : > Commit f1fb08f6337c ("vxlan: fix ND proxy when skb doesn't have transport > header offset") removed icmp6_code and icmp6_type check before calling > neigh_reduce when doing neigh proxy. > > It means all icmpv6 packets would be blocked by this, not only ns packet. > In Jianlin's env, even ping6 couldn't work through it. > > This patch is to bring the icmp6_code and icmp6_type check back and also > removed the same check from neigh_reduce(). I am very sorry for not having spotted this bug earlier. I have tested your fix and I can confirm it works as expected. > Fixes: f1fb08f6337c ("vxlan: fix ND proxy when skb doesn't have transport > header offset") > Reported-by: Jianlin Shi <ji...@redhat.com> > Signed-off-by: Xin Long <lucien@gmail.com> Reviewed-by: Vincent Bernat <vinc...@bernat.im> -- Don't just echo the code with comments - make every comment count. - The Elements of Programming Style (Kernighan & Plauger)
Re: [net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl
❦ 21 septembre 2017 08:15 -0700, Stephen Hemminger <step...@networkplumber.org> : >> Currently, there is a difference in netlink events received when an >> interface is modified through bridge ioctl() or through netlink. This >> patch generates additional events when an interface is added to or >> removed from a bridge via ioctl(). >> >> When adding then removing an interface from a bridge with netlink, we >> get: >> >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 >> state UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state >> UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> >> When using ioctl(): >> >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master >> bridge0 state UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state >> UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 >> state UNKNOWN >> link/ether 9e:da:60:ee:cf:c8 >> 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state >> UNKNOWN group default >> link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff >> >> Without this patch, the last netlink notification is not sent. >> >> Signed-off-by: Vincent Bernat <vinc...@bernat.im> > > This makes sense, you should probably add a Fixes: tag to help maintainers > of long term stable kernels. > > Reviewed-by: Stephen Hemminger <step...@networkplumber.org> I wouldn't know which commit would be fixed since this is not a regression, just a behavior difference. -- Make sure special cases are truly special. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
❦ 21 septembre 2017 08:09 -0700, Roopa Prabhu: >>> The one concern is that ports added or removed through ioctl should >>> cause same events as doing the same thing via netlink. Some users use >>> brctl (ioctl) and others use newer bridge (netlink) API. >> >> I'll make a third iteration to have the same notifications when using >> ioctl() with details in the commit message. >> -- > > as long as the ioctl path for bridge port removal is generating a: > RTM_DELLINK with AF_BRIDGE and > RTM_NEWLINK with AF_UNSPEC with 'master' removed > > we should be good. These are the most important ones. > > are there other specific bridge-events missing ?. you might want to > run `bridge monitor link` also. and un-slaving of a port also triggers > fdb events which are visible under `bridge monitor fdb` With the patch, bridge monitor link generates the same event with ioctl() than with netlink (like for ip monitor link, netlink generates one extra duplicate event when enslaving). For bridge monitor fdb, there is a difference. With ioctl(), I don't get the event for VLAN1: Deleted ca:18:06:bc:f6:11 dev dummy1 vlan 1 master bridge0 permanent I suppose this is an expected difference due to the inability to manage VLAN-aware bridges with ioctl(). -- Use the fundamental control flow constructs. - The Elements of Programming Style (Kernighan & Plauger)
[net-next v3] bridge: trigger RTM_NEWLINK when interface is modified by bridge ioctl
Currently, there is a difference in netlink events received when an interface is modified through bridge ioctl() or through netlink. This patch generates additional events when an interface is added to or removed from a bridge via ioctl(). When adding then removing an interface from a bridge with netlink, we get: 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff When using ioctl(): 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue master bridge0 state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 Deleted 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 9e:da:60:ee:cf:c8 5: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 9e:da:60:ee:cf:c8 brd ff:ff:ff:ff:ff:ff Without this patch, the last netlink notification is not sent. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/bridge/br_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..66cd98772051 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -102,6 +102,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) else ret = br_del_if(br, dev); + if (!ret) + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + return ret; } -- 2.14.1
Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
❦ 20 septembre 2017 16:21 -0700, Stephen Hemminger: > The one concern is that ports added or removed through ioctl should > cause same events as doing the same thing via netlink. Some users use > brctl (ioctl) and others use newer bridge (netlink) API. I'll make a third iteration to have the same notifications when using ioctl() with details in the commit message. -- When in doubt, tell the truth. -- Mark Twain
Re: [PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
❦ 20 septembre 2017 15:57 -0600, David Ahern: > The DELLINK is for AF_BRIDGE family (ifi_family). Adding family to > print_linkinfo and running the monitor I get: > > > [LINK]family 0: 35: dummy1: mtu 1500 qdisc > noqueue master br0 state UNKNOWN group default > link/ether d6:c3:73:86:3c:73 brd ff:ff:ff:ff:ff:ff > > [LINK]family 7: 35: dummy1: mtu 1500 > master br0 state UNKNOWN > link/ether d6:c3:73:86:3c:73 > > [LINK]Deleted family 7: 35: dummy1: mtu > 1500 master br0 state UNKNOWN > link/ether d6:c3:73:86:3c:73 I didn't know about the family. We can drop the patch. -- One of the most striking differences between a cat and a lie is that a cat has only nine lives. -- Mark Twain, "Pudd'nhead Wilson's Calendar"
[PATCH net-next v2] bridge: also trigger RTM_NEWLINK when interface is released from bridge
Currently, when an interface is released from a bridge via ioctl(), we get a RTM_DELLINK event through netlink: Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 6e:23:c2:54:3a:b3 Userspace has to interpret that as a removal from the bridge, not as a complete removal of the interface. When an bridged interface is completely removed, we get two events: Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN link/ether 6e:23:c2:54:3a:b3 Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff In constrast, when an interface is released from a bond, we get a RTM_NEWLINK with only the new characteristics (no master): 3: dummy1: <BROADCAST,NOARP,SLAVE,UP,LOWER_UP> mtu 1500 qdisc noqueue master bond0 state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff 4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff Userland may be confused by the fact we say a link is deleted while its characteristics are only modified. A first solution would have been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK event. However, maybe some piece of userland is relying on this RTM_DELLINK to detect when a bridged interface is released. Instead, we also emit a RTM_NEWLINK event once the interface is released (without master info). Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 8a:bb:e7:94:b1:f8 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff This is done only when using ioctl(). When using Netlink, such an event is already automatically emitted in do_setlink(). Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/bridge/br_ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 7970f8540cbb..3148cb3a8e82 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -99,8 +99,10 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) if (isadd) ret = br_add_if(br, dev); - else + else { ret = br_del_if(br, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + } return ret; } -- 2.14.1
Re: [PATCH net-next v1] bridge: also trigger RTM_NEWLINK when interface is released from bridge
❦ 15 septembre 2017 21:38 +0200, Vincent Bernat <vinc...@bernat.im> : > Currently, when an interface is released from a bridge, we get a > RTM_DELLINK event through netlink: > > Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 > state UNKNOWN > link/ether 6e:23:c2:54:3a:b3 It should be noted this only happens when using the ioctl API. When using the netlink API, rtnetlink.c will send a RTM_NEWLINK because of the modification. -- When in doubt, tell the truth. -- Mark Twain
[PATCH net-next v1] bridge: also trigger RTM_NEWLINK when interface is released from bridge
Currently, when an interface is released from a bridge, we get a RTM_DELLINK event through netlink: Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 6e:23:c2:54:3a:b3 Userspace has to interpret that as a removal from the bridge, not as a complete removal of the interface. When an bridged interface is completely removed, we get two events: Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 master bridge0 state DOWN link/ether 6e:23:c2:54:3a:b3 Deleted 2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default link/ether 6e:23:c2:54:3a:b3 brd ff:ff:ff:ff:ff:ff In constrast, when an interface is released from a bond, we get a RTM_NEWLINK with only the new characteristics (no master): 3: dummy1: <BROADCAST,NOARP,SLAVE,UP,LOWER_UP> mtu 1500 qdisc noqueue master bond0 state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff 3: dummy1: <BROADCAST,NOARP> mtu 1500 qdisc noqueue state DOWN group default link/ether ca:c8:7b:66:f8:25 brd ff:ff:ff:ff:ff:ff 4: bond0: <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default link/ether ae:dc:7a:8c:9a:3c brd ff:ff:ff:ff:ff:ff Userland may be confused by the fact we say a link is deleted while its characteristics are only modified. A first solution would have been to turn the RTM_DELLINK event in del_nbp() into a RTM_NEWLINK event. However, maybe some piece of userland is relying on this RTM_DELLINK to detect when a bridged interface is released. Instead, we also emit a RTM_NEWLINK event once the interface is released (without master info). Deleted 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 master bridge0 state UNKNOWN link/ether 8a:bb:e7:94:b1:f8 2: dummy0: <BROADCAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default link/ether 8a:bb:e7:94:b1:f8 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/bridge/br_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..636e0a842f8a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -288,6 +288,8 @@ static void del_nbp(struct net_bridge_port *p) dev->priv_flags &= ~IFF_BRIDGE_PORT; + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_MASTER, GFP_KERNEL); + netdev_rx_handler_unregister(dev); br_multicast_del_port(p); -- 2.14.1
[PATCH net-next v2] net: ipv6: avoid overhead when no custom FIB rules are installed
If the user hasn't installed any custom rules, don't go through the whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4: Avoid overhead when no custom FIB rules are installed). Using a micro-benchmark module [1], timing ip6_route_output() with get_cycles(), with 40,000 routes in the main routing table, before this patch: min=606 max=12911 count=627 average=1959 95th=4903 90th=3747 50th=1602 mad=821 table=254 avgdepth=21.8 maxdepth=39 value │ ┊count 600 │▒▒▒ 199 880 │▒▒▒ 43 1160 │▒▒▒ 48 1440 │▒▒▒░░░ 43 1720 │░░░ 59 2000 │▒▒▒ 50 2280 │▒▒░░░26 2560 │▒▒░ 31 2840 │▒▒ 28 3120 │▒░░ 17 3400 │▒░░░ 17 3680 │░ 8 3960 │░░ 11 4240 │░░6 4520 │░░░ 6 4800 │░░░ 9 After: min=544 max=11687 count=627 average=1776 95th=4546 90th=3585 50th=1227 mad=565 table=254 avgdepth=21.8 maxdepth=39 value │ ┊count 540 │201 800 │▒63 1060 │▒░ 68 1320 │▒▒▒░░39 1580 │▒▒░░ 32 1840 │▒▒ 32 2100 │▒▒░░░34 2360 │▒▒░░ 33 2620 │▒▒ 26 2880 │▒░░ 22 3140 │ 9 3400 │░ 8 3660 │░ 9 3920 │░░8 4180 │░░░ 8 4440 │░░░ 8 At the frequency of the host during the bench (~ 3.7 GHz), this is about a 100 ns difference on the median value. A next step would be to collapse local and main tables, as in 0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse). [1]: https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c Signed-off-by: Vincent Bernat <vinc...@bernat.im> Reviewed-by: Jiri Pirko <j...@mellanox.com> --- include/net/netns/ipv6.h | 1 + net/ipv6/fib6_rules.c| 40 +++- net/ipv6/route.c | 1 + 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index abdf3b40303b..0e50bf3ed097 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -65,6 +65,7 @@ struct netns_ipv6 { unsigned int ip6_rt_gc_expire; unsigned longip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES + bool fib6_has_custom_rules; struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2f29e4e33bd3..b240f24a6e52 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net) struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { - struct fib_lookup_arg arg = { - .lookup_ptr = lookup, - .flags = FIB_LOOKUP_NOREF, - }; - - /* update flow if oif or iif point to device enslaved to l3mdev */ - l3mdev_update_flow(net, flowi6_to_flowi(fl6)); - - fib_rules_lookup(net->ipv6.fib6_rules_ops, -flowi6_to_flowi(fl6), flags, ); - - if (arg.result) - return arg.result;
Re: [net-next v1] net: ipv6: avoid overhead when no custom FIB rules are installed
❦ 8 août 2017 08:46 -0600, David Ahern: >> diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c >> index 2f29e4e33bd3..693c27ede40e 100644 >> --- a/net/ipv6/fib6_rules.c >> +++ b/net/ipv6/fib6_rules.c >> @@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net) >> struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, >> int flags, pol_lookup_t lookup) >> { >> -struct fib_lookup_arg arg = { >> -.lookup_ptr = lookup, >> -.flags = FIB_LOOKUP_NOREF, >> -}; >> - >> /* update flow if oif or iif point to device enslaved to l3mdev */ >> l3mdev_update_flow(net, flowi6_to_flowi(fl6)); > > The l3mdev_update_flow can be moved to the has_custom_rules block. > l3mdev requires FIB rules for the lookups to work, so no rules means no > l3mdev configured. > > Rest looks good to me. I suspected that it could be moved. I'll update the patch tomorrow. -- Choose variable names that won't be confused. - The Elements of Programming Style (Kernighan & Plauger)
[net-next v1] net: ipv6: avoid overhead when no custom FIB rules are installed
If the user hasn't installed any custom rules, don't go through the whole FIB rules layer. This is pretty similar to f4530fa574df (ipv4: Avoid overhead when no custom FIB rules are installed). Using a micro-benchmark module [1], timing ip6_route_output() using get_cycles(), on my laptop with performance governor (so, subject to some caution due to thermal throttling), with 40,000 routes in the main routing table, before this patch: min=603 max=12477 count=627 average=1926 95th=4518 90th=3661 50th=1600 mad=724 table=254 avgdepth=21.8 maxdepth=39 value │ ┊count 600 │▒▒ 134 860 │▒▒▒░░91 1120 │▒▒▒░░39 1380 │░58 1640 │░░ 57 1900 │░░ 58 2160 │▒▒▒░░░ 46 2420 │▒▒░░ 26 2680 │▒▒ 27 2940 │▒░░ 15 3200 │ 8 3460 │░ 8 3720 │░░ 12 3980 │░░9 4240 │░░░ 7 4500 │░░░ 6 After: min=486 max=12390 count=627 average=1715 95th=4365 90th=3352 50th=1324 mad=631 table=254 avgdepth=21.8 maxdepth=39 value │ ┊count 480 │153 730 │▒▒▒ 88 980 │░░░ 57 1230 │ 56 1480 │▒▒▒ 43 1730 │▒▒▒ 50 1980 │▒▒ 31 2230 │▒░░ 23 2480 │▒15 2730 │▒░ 23 2980 │▒░░░ 20 3230 │▒14 3480 │░░4 3730 │░░8 3980 │░░░ 9 4230 │░░░ 4 At the frequency of the laptop during the bench (~ 2.3 GHz), this is about a 130 ns difference on the median value and 50 ns on the minimal value. A next step would be to collapse local and main tables, as in 0ddcf43d5d4a (ipv4: FIB Local/MAIN table collapse). [1]: https://github.com/vincentbernat/network-lab/blob/master/lab-routes-ipv6/kbench_mod.c Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- include/net/netns/ipv6.h | 1 + net/ipv6/fib6_rules.c| 34 -- net/ipv6/route.c | 1 + 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index abdf3b40303b..0e50bf3ed097 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -65,6 +65,7 @@ struct netns_ipv6 { unsigned int ip6_rt_gc_expire; unsigned longip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES + bool fib6_has_custom_rules; struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 2f29e4e33bd3..693c27ede40e 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -63,19 +63,32 @@ unsigned int fib6_rules_seq_read(struct net *net) struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, int flags, pol_lookup_t lookup) { - struct fib_lookup_arg arg = { - .lookup_ptr = lookup, - .flags = FIB_LOOKUP_NOREF, - }; - /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_fl
[PATCH net-next v1] ip6: fix PMTU discovery when using /127 subnets
The definition of an "anycast destination address" has been tweaked as a side-effect of commit 2647a9b07032 ("ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST"). The first address of a point-to-point /127 subnet is now considered as an anycast address. This prevents ICMPv6 errors to be returned to a sender of such a subnet and breaks PMTU discovery. This can be reproduced with: ip link add name out6 type veth peer name in6 ip link add name out7 type veth peer name in7 ip link set mtu 1400 dev out7 ip link set mtu 1400 dev in7 ip netns add next-hop ip netns add next-next-hop ip link set netns next-hop dev in6 ip link set netns next-hop dev out7 ip link set netns next-next-hop dev in7 ip link set up dev out6 ip addr add 2001:db8:1::12/127 dev out6 ip netns exec next-hop ip link set up dev in6 ip netns exec next-hop ip link set up dev out7 ip netns exec next-hop ip addr add 2001:db8:1::13/127 dev in6 ip netns exec next-hop ip addr add 2001:db8:1::14/127 dev out7 ip netns exec next-hop ip route add default via 2001:db8:1::15 ip netns exec next-hop sysctl -qw net.ipv6.conf.all.forwarding=1 ip netns exec next-next-hop ip link set up dev in7 ip netns exec next-next-hop ip addr add 2001:db8:1::15/127 dev in7 ip netns exec next-next-hop ip addr add 2001:db8:1::50/128 dev in7 ip netns exec next-next-hop ip route add default via 2001:db8:1::14 ip netns exec next-next-hop sysctl -qw net.ipv6.conf.all.forwarding=1 ip route add 2001:db8:1::48/123 via 2001:db8:1::13 sleep 4 ping -M do -s 1452 -c 3 2001:db8:1::50 || true ip route get 2001:db8:1::50 Before the patch, we get: 2001:db8:1::50 from :: via 2001:db8:1::13 dev out6 src 2001:db8:1::12 metric 1024 pref medium After the patch, we get: 2001:db8:1::50 via 2001:db8:1::13 dev out6 src 2001:db8:1::12 metric 0 cache expires 578sec mtu 1400 pref medium Fixes: 2647a9b07032 ("ipv6: Remove external dependency on rt6i_gateway and RTF_ANYCAST") Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- include/net/ip6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 199056933dcb..907d39a42f6b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -194,7 +194,7 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst, struct rt6_info *rt = (struct rt6_info *)dst; return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen != 128 && + (rt->rt6i_dst.plen < 127 && ipv6_addr_equal(>rt6i_dst.addr, daddr)); } -- 2.13.2
[net-next] net: remove policy-routing.txt documentation
It dates back from 2.1.16 and is obsolete since 2.1.68 when the current rule system has been introduced. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- Documentation/networking/policy-routing.txt | 150 1 file changed, 150 deletions(-) delete mode 100644 Documentation/networking/policy-routing.txt diff --git a/Documentation/networking/policy-routing.txt b/Documentation/networking/policy-routing.txt deleted file mode 100644 index 36f6936d7f21.. --- a/Documentation/networking/policy-routing.txt +++ /dev/null @@ -1,150 +0,0 @@ -Classes - - "Class" is a complete routing table in common sense. - I.e. it is tree of nodes (destination prefix, tos, metric) - with attached information: gateway, device etc. - This tree is looked up as specified in RFC1812 5.2.4.3 - 1. Basic match - 2. Longest match - 3. Weak TOS. - 4. Metric. (should not be in kernel space, but they are) - 5. Additional pruning rules. (not in kernel space). - - We have two special type of nodes: - REJECT - abort route lookup and return an error value. - THROW - abort route lookup in this class. - - - Currently the number of classes is limited to 255 - (0 is reserved for "not specified class") - - Three classes are builtin: - - RT_CLASS_LOCAL=255 - local interface addresses, - broadcasts, nat addresses. - - RT_CLASS_MAIN=254 - all normal routes are put there - by default. - - RT_CLASS_DEFAULT=253 - if ip_fib_model==1, then - normal default routes are put there, if ip_fib_model==2 - all gateway routes are put there. - - -Rules -- - Rule is a record of (src prefix, src interface, tos, dst prefix) - with attached information. - - Rule types: - RTP_ROUTE - lookup in attached class - RTP_NAT - lookup in attached class and if a match is found, - translate packet source address. - RTP_MASQUERADE - lookup in attached class and if a match is found, - masquerade packet as sourced by us. - RTP_DROP - silently drop the packet. - RTP_REJECT - drop the packet and send ICMP NET UNREACHABLE. - RTP_PROHIBIT - drop the packet and send ICMP COMM. ADM. PROHIBITED. - - Rule flags: - RTRF_LOG - log route creations. - RTRF_VALVE - One way route (used with masquerading) - -Default setup: - -root@amber:/pub/ip-routing # iproute -r -Kernel routing policy rules -Pref Source DestinationTOS Iface Cl - 0 defaultdefault00 * 255 - 254 defaultdefault00 * 254 - 255 defaultdefault00 * 253 - - -Lookup algorithm - - - We scan rules list, and if a rule is matched, apply it. - If a route is found, return it. - If it is not found or a THROW node was matched, continue - to scan rules. - -Applications - - -1. Just ignore classes. All the routes are put into MAIN class - (and/or into DEFAULT class). - - HOWTO: iproute add PREFIX [ tos TOS ] [ gw GW ] [ dev DEV ] - [ metric METRIC ] [ reject ] ... (look at iproute utility) - - or use route utility from current net-tools. - -2. Opposite case. Just forget all that you know about routing - tables. Every rule is supplied with its own gateway, device - info. record. This approach is not appropriate for automated - route maintenance, but it is ideal for manual configuration. - - HOWTO: iproute addrule [ from PREFIX ] [ to PREFIX ] [ tos TOS ] - [ dev INPUTDEV] [ pref PREFERENCE ] route [ gw GATEWAY ] - [ dev OUTDEV ] . - - Warning: As of now the size of the routing table in this - approach is limited to 256. If someone likes this model, I'll - relax this limitation. - -3. OSPF classes (see RFC1583, RFC1812 E.3.3) - Very clean, stable and robust algorithm for OSPF routing - domains. Unfortunately, it is not widely used in the Internet. - - Proposed setup: - 255 local addresses - 254 interface routes - 253 ASE routes with external metric - 252 ASE routes with internal metric - 251 inter-area routes - 250 intra-area routes for 1st area - 249 intra-area routes for 2nd area - etc. - - Rules: - iproute addrule class 253 - iproute addrule class 252 - iproute addrule class 251 - iproute addrule to a-prefix-for-1st-area class 250 - iproute addrule to another-prefix-for-1st-area class 250 - ... - iproute addrule to a-prefix-for-2nd-area class 249 - ... - - Area classes must be terminated with reject record. - iproute add default reject class 250 - iproute ad
[PATCH net-next v4] vxlan: fix ND proxy when skb doesn't have transport header offset
When an incoming frame is tagged or when GRO is disabled, the skb handled to vxlan_xmit() doesn't contain a valid transport header offset. This makes ND proxying fail. We combine two changes: replace use of skb_transport_offset() and ensure the necessary amount of skb is linear just before using it: - In vxlan_xmit(), when determining if we have an ICMPv6 neighbor discovery packet, just check if it is an ICMPv6 packet and rely on neigh_reduce() to do more checks if this is the case. The use of pskb_may_pull() is replaced by skb_header_pointer() for just the IPv6 header. - In neigh_reduce(), add pskb_may_pull() for IPv6 header and neighbor discovery message since this was removed from vxlan_xmit(). Replace skb_transport_header() with ipv6_hdr() + 1. - In vxlan_na_create(), replace first skb_transport_offset() with ipv6_hdr() + 1 and second with skb_network_offset() + sizeof(struct ipv6hdr). Additionally, ensure we pskb_may_pull() the whole skb as we need it to iterate over the options. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/vxlan.c | 29 +++-- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1e54fb5c883a..89cb8267 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1515,7 +1515,7 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, int ns_olen; int i, len; - if (dev == NULL) + if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + @@ -1530,10 +1530,11 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); - ns = (struct nd_msg *)skb_transport_header(request); + ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; - ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns); + ns_olen = request->len - skb_network_offset(request) - + sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); @@ -1604,10 +1605,13 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) if (!in6_dev) goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) + goto out; + iphdr = ipv6_hdr(skb); daddr = >daddr; - msg = (struct nd_msg *)skb_transport_header(skb); + msg = (struct nd_msg *)(iphdr + 1); if (msg->icmph.icmp6_code != 0 || msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) goto out; @@ -2242,16 +2246,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) - else if (ntohs(eth->h_proto) == ETH_P_IPV6 && -pskb_may_pull(skb, sizeof(struct ipv6hdr) - + sizeof(struct nd_msg)) && -ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { - struct nd_msg *msg; - - msg = (struct nd_msg *)skb_transport_header(skb); - if (msg->icmph.icmp6_code == 0 && - msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) - return neigh_reduce(dev, skb, vni); + else if (ntohs(eth->h_proto) == ETH_P_IPV6) { + struct ipv6hdr *hdr, _hdr; + if ((hdr = skb_header_pointer(skb, + skb_network_offset(skb), + sizeof(_hdr), &_hdr)) && + hdr->nexthdr == IPPROTO_ICMPV6) + return neigh_reduce(dev, skb, vni); } #endif } -- 2.11.0
[net-next v3] vxlan: fix ND proxy when skb doesn't have transport header offset
When an incoming frame is tagged or when GRO is disabled, the skb handled to vxlan_xmit() doesn't contain a valid transport header offset. This makes ND proxying fail. We combine two changes: replace use of skb_transport_offset() and ensure the necessary amount of skb is linear just before using it: - In vxlan_xmit(), when determining if we have an ICMPv6 neighbor discovery packet, just check if it is an ICMPv6 packet and rely on neigh_reduce() to do more checks if this is the case. The use of pskb_may_pull() is replaced by skb_header_pointer() for just the IPv6 header. - In neigh_reduce(), add pskb_may_pull() for IPv6 header and neighbor discovery message since this was removed from vxlan_xmit(). Replace skb_transport_header() with ipv6_hdr() + 1. - In vxlan_na_create(), replace first skb_transport_offset() with ipv6_hdr() + 1 and second with skb_network_offset() + sizeof(struct ipv6hdr). Additionally, ensure we pskb_may_pull() the whole skb as we need it to iterate over the options. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/vxlan.c | 28 ++-- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1e54fb5c883a..54dda367de2b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1515,7 +1515,7 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, int ns_olen; int i, len; - if (dev == NULL) + if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + @@ -1530,10 +1530,11 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); - ns = (struct nd_msg *)skb_transport_header(request); + ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; - ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns); + ns_olen = request->len - skb_network_offset(request) - + sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); @@ -1604,10 +1605,13 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) if (!in6_dev) goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg))) + goto out; + iphdr = ipv6_hdr(skb); daddr = >daddr; - msg = (struct nd_msg *)skb_transport_header(skb); + msg = (struct nd_msg *)(iphdr + 1); if (msg->icmph.icmp6_code != 0 || msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) goto out; @@ -2238,21 +2242,17 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } if (vxlan->flags & VXLAN_F_PROXY) { + struct ipv6hdr *hdr, _hdr; eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && -pskb_may_pull(skb, sizeof(struct ipv6hdr) - + sizeof(struct nd_msg)) && -ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { - struct nd_msg *msg; - - msg = (struct nd_msg *)skb_transport_header(skb); - if (msg->icmph.icmp6_code == 0 && - msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) - return neigh_reduce(dev, skb, vni); - } +(hdr = skb_header_pointer(skb, + skb_network_offset(skb), + sizeof(_hdr), &_hdr)) && +hdr->nexthdr == IPPROTO_ICMPV6) + return neigh_reduce(dev, skb, vni); #endif } -- 2.11.0
Re: [net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset
❦ 30 mars 2017 06:36 -0700, Eric Dumazet: >>> Parsing of neighbor discovery options is done earlier to ignore the >>> whole packet in case of a malformed option. Moreover, the assumption the >>> skb was linear is removed and options are extracted with >>> skb_header_pointer() as well. The check on the source link-layer address >>> option is also more strict (for Ethernet, we expect the length field to >>> be 1). >> >> There is some parsing implemented in net/ipv6/ndisc.c, notably >> ndisc_parse_options(). I don't know if this is a good idea to reuse >> that: it may have the expectation that some IP processing has already >> been done (for example, the IPv6 length has already been checked, the >> SKB is expected to be linear). > > Forcing ICMP being linear is probably fine. > > Prior to 91269e390d062b52 ("vxlan: using pskb_may_pull as early as possible") > this was happening (at the wrong place) in neigh_reduce() doing a : > if (!pskb_may_pull(skb, skb->len)) > goto out; OK, I'll simplify the patch then. > So the following would be ok (while incomplete of course) > > diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c > index > bdb6ae16d4a85bf9539199e189011bce104ba51a..cd032819d4a36d5ca94739f20f947f3f5a31aba3 > 100644 > --- a/drivers/net/vxlan.c > +++ b/drivers/net/vxlan.c > @@ -2243,12 +2243,13 @@ static netdev_tx_t vxlan_xmit(struct sk_buff > *skb, struct net_device *dev) > return arp_reduce(dev, skb, vni); > #if IS_ENABLED(CONFIG_IPV6) > else if (ntohs(eth->h_proto) == ETH_P_IPV6 && > -pskb_may_pull(skb, sizeof(struct ipv6hdr) > - + sizeof(struct nd_msg)) && > +skb->len >= sizeof(struct ipv6hdr) + > +sizeof(struct nd_msg) && > +pskb_may_pull(skb, skb->len) && > ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { > struct nd_msg *msg; > > - msg = (struct nd_msg > *)skb_transport_header(skb); > + msg = (struct nd_msg *)(ipv6_hdr(skb) + 1); > if (msg->icmph.icmp6_code == 0 && > msg->icmph.icmp6_type == > NDISC_NEIGHBOUR_SOLICITATION) > return neigh_reduce(dev, skb, vni); pskb_may_pull() is called while we only know this is an IPv6 packet, not an ICMPv6 one. I'll keep skb_header_pointer to handle IPv6 header, then I'll pull the whole ICMP packet (unless I am mistaken, of course). -- The devil can cite Scripture for his purpose. -- William Shakespeare, "The Merchant of Venice"
Re: [net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset
❦ 29 mars 2017 22:47 +0200, Vincent Bernat <vinc...@bernat.im> : > Parsing of neighbor discovery options is done earlier to ignore the > whole packet in case of a malformed option. Moreover, the assumption the > skb was linear is removed and options are extracted with > skb_header_pointer() as well. The check on the source link-layer address > option is also more strict (for Ethernet, we expect the length field to > be 1). There is some parsing implemented in net/ipv6/ndisc.c, notably ndisc_parse_options(). I don't know if this is a good idea to reuse that: it may have the expectation that some IP processing has already been done (for example, the IPv6 length has already been checked, the SKB is expected to be linear). -- Watch out for off-by-one errors. - The Elements of Programming Style (Kernighan & Plauger)
[net-next v2] vxlan: fix ND proxy when skb doesn't have transport header offset
When an incoming frame is tagged or when GRO is disabled, the skb handled to vxlan_xmit() doesn't contain a valid transport header offset. This makes ND proxying fail. Do not rely on skb_transport_offset(). Instead, use offsets from skb_network_offset() with skb_header_pointer() to extract appropriate information to detect an ICMPv6 neighbor solicitation and to extract the information needed to build the answer. Duplicate checks at the beginning of neigh_reduce() are removed. Parsing of neighbor discovery options is done earlier to ignore the whole packet in case of a malformed option. Moreover, the assumption the skb was linear is removed and options are extracted with skb_header_pointer() as well. The check on the source link-layer address option is also more strict (for Ethernet, we expect the length field to be 1). After this change, the vxlan module is correctly able to answer to VLAN-encapsulated frames: 22:02:46.332573 50:54:33:00:00:02 > 33:33:00:00:00:02, ethertype 802.1Q (0x8100), length 74: vlan 100, p 0,ethertype IPv6, (hlim 255, next-header ICMPv6 (58) payload length: 16) fe80::5254:33ff:fe00:2 > ff02::2: [icmp6 sum ok] ICMP6, router solicitation, length 16 source link-address option (1), length 8 (1): 50:54:33:00:00:02 0x: 5054 3300 0002 22:02:47.846631 50:54:33:00:00:08 > 33:33:00:00:00:02, ethertype 802.1Q (0x8100), length 74: vlan 100, p 0,ethertype IPv6, (hlim 255, next-header ICMPv6 (58) payload length: 16) fe80::5254:33ff:fe00:8 > ff02::2: [icmp6 sum ok] ICMP6, router solicitation, length 16 source link-address option (1), length 8 (1): 50:54:33:00:00:08 0x: 5054 3300 0008 Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/vxlan.c | 85 ++--- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 1e54fb5c883a..f40272785aa2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1504,20 +1504,43 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *vxlan_na_create(struct sk_buff *request, - struct neighbour *n, bool isrouter) + struct ipv6hdr *ip6h, struct nd_msg *ns, + struct neighbour *n, bool isrouter) { struct net_device *dev = request->dev; struct sk_buff *reply; - struct nd_msg *ns, *na; + struct nd_msg *na; struct ipv6hdr *pip6; - u8 *daddr; + u8 *daddr, _daddr[ETH_ALEN]; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ - int ns_olen; - int i, len; + int len, remaining, offset; if (dev == NULL) return NULL; + /* Destination address is the "source link-layer address" +* option if present and valid or the source Ethernet +* address */ + daddr = eth_hdr(request)->h_source; + remaining = htons(ip6h->payload_len) - sizeof(*ns); + offset = skb_network_offset(request) + sizeof(*ip6h) + sizeof(*ns); + while (remaining > sizeof(struct nd_opt_hdr)) { + struct nd_opt_hdr *ohdr, _ohdr; + ohdr = skb_header_pointer(request, offset, sizeof(_ohdr), &_ohdr); + if (!ohdr || !(len = ohdr->nd_opt_len<<3) || len > remaining) + return NULL; + if (ohdr->nd_opt_type == ND_OPT_SOURCE_LL_ADDR && + len == na_olen) { + daddr = skb_header_pointer(request, offset + sizeof(_ohdr), + sizeof(_daddr), _daddr); + if (!daddr) + return NULL; + break; + } + remaining -= len; + offset += len; + } + len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); @@ -1530,16 +1553,6 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *request, skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); - ns = (struct nd_msg *)skb_transport_header(request); - - daddr = eth_hdr(request)->h_source; - ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns); - for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { - if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { - daddr = ns->opt + i + sizeof(struct nd_opt_hdr); - break; - } - } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); @@ -1556,10 +1569,10 @@ static struct sk_buff *vxlan_na_create(struct sk_buff *reque
[net-next v1] vxlan: use appropriate family on L3 miss
When sending a L3 miss, the family is set to AF_INET even for IPv6. This causes userland (eg "ip monitor") to be confused. Ensure we send the appropriate family in this case. For L2 miss, keep using AF_INET. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/vxlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e375560cc74e..168257aa8ace 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -276,9 +276,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; if (type == RTM_GETNEIGH) { - ndm->ndm_family = AF_INET; send_ip = !vxlan_addr_any(>remote_ip); send_eth = !is_zero_ether_addr(fdb->eth_addr); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; -- 2.11.0
[PATCH] vxlan: use preferred address family when neither group or remote is specified
When neither group or remote is specified (or if they are specified with the any address), nothing is sent to the kernel. In this case, the kernel defaults to IPv4. This makes impossible to use IPv6 with unspecified unicast remote ("bridge fdb add" will return EAFNOTSUPPORT). If the user specifies a preferred address family (eg, "ip -6 link add"), then send either IFLA_VXLAN_GROUP or IFLA_VXLAN_GROUP6 to enforce the use of the appropriate family. Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- ip/iplink_vxlan.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c index 6d02bb47b2f0..fef7d3af4990 100644 --- a/ip/iplink_vxlan.c +++ b/ip/iplink_vxlan.c @@ -286,10 +286,14 @@ static int vxlan_parse_opt(struct link_util *lu, int argc, char **argv, addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4); else if (daddr) addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4); - if (!IN6_IS_ADDR_UNSPECIFIED()) + else if (!IN6_IS_ADDR_UNSPECIFIED()) addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct in6_addr)); else if (!IN6_IS_ADDR_UNSPECIFIED()) addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct in6_addr)); + else if (preferred_family == AF_INET) + addattr_l(n, 1024, IFLA_VXLAN_GROUP, , 4); + else if (preferred_family == AF_INET6) + addattr_l(n, 1024, IFLA_VXLAN_GROUP6, , sizeof(struct in6_addr)); if (saddr) addattr_l(n, 1024, IFLA_VXLAN_LOCAL, , 4); -- 2.11.0
Re: [PATCH] net: ethtool: don't require CAP_NET_ADMIN for ETHTOOL_GLINKSETTINGS
❦ 24 novembre 2016 10:55 +0100, Miroslav Lichvar: > The ETHTOOL_GLINKSETTINGS command is deprecating the ETHTOOL_GSET > command and likewise it shouldn't require the CAP_NET_ADMIN > capability. Could this patch be pushed to stable branches too? -- Each module should do one thing well. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable
❦ 19 septembre 2016 06:58 CEST, David Miller: >> @@ -1808,6 +1808,30 @@ static struct rt6_info *ip6_nh_lookup_table(struct >> net *net, >> return rt; >> } >> >> +static int ip6_nh_valid(struct rt6_info *grt, >> +struct net_device **dev, struct inet6_dev **idev) { >> +int ret = 0; > > First, this is not formatted properly. The openning brace should start > on a new line. > > Second, please use "bool", "true", and "false" for the return value. Noted for the next time. However, the v3 version of the patch doesn't have the function anymore. -- Avoid temporary variables. - The Elements of Programming Style (Kernighan & Plauger)
[v3] net: ipv6: fallback to full lookup if table lookup is unsuitable
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/ipv6/route.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..ec33c6d7eed5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1991,9 +1991,18 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - if (cfg->fc_table) + if (cfg->fc_table) { grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); -- 2.9.3
[v2] net: ipv6: fallback to full lookup if table lookup is unsuitable
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/ipv6/route.c | 8 1 file changed, 8 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..2c6c7257ff75 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1994,6 +1994,14 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (cfg->fc_table) grt = ip6_nh_lookup_table(net, cfg, gw_addr); + if (grt) { + if (grt->rt6i_flags & RTF_GATEWAY || + (dev && dev != grt->dst.dev)) { + ip6_rt_put(grt); + grt = NULL; + } + } + if (!grt) grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); -- 2.9.3
Re: [PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable
❦ 16 septembre 2016 20:36 CEST, David Ahern: >> contained a non-connected route (like a default gateway) fails while it >> was previously working: >> >> $ ip link add eth0 type dummy >> $ ip link set up dev eth0 >> $ ip addr add 2001:db8::1/64 dev eth0 >> $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 >> $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 >> RTNETLINK answers: No route to host >> $ ip -6 route show table 20 >> default via 2001:db8::5 dev eth0 metric 1024 pref medium > > so your table 20 is not complete in that it lacks a connected route to > resolve 2001:db8::6 as a nexthop, so you are relying on a fallback to > other tables (main in this case). Yes. >> @@ -1991,33 +2015,15 @@ static struct rt6_info *ip6_route_info_create(struct >> fib6_config *cfg) >> if (!(gwa_type & IPV6_ADDR_UNICAST)) >> goto out; >> >> +err = -EHOSTUNREACH; >> if (cfg->fc_table) >> grt = ip6_nh_lookup_table(net, cfg, gw_addr); > > -8<- > >> -if (!(grt->rt6i_flags & RTF_GATEWAY)) >> -err = 0; > > This is the check that is failing for your use > case. ip6_nh_lookup_table is returning the default route and nexthops > can not rely on a gateway. Given that a simpler and more direct change > is (whitespace mangled on paste): > > diff --git a/net/ipv6/route.c b/net/ipv6/route.c > index ad4a7ff301fc..48bae2ee2e18 100644 > --- a/net/ipv6/route.c > +++ b/net/ipv6/route.c > @@ -1991,9 +1991,19 @@ static struct rt6_info *ip6_route_info_create(struct > fib6_config *cfg) > if (!(gwa_type & IPV6_ADDR_UNICAST)) > goto out; > > - if (cfg->fc_table) > + if (cfg->fc_table) { > grt = ip6_nh_lookup_table(net, cfg, gw_addr); > > + /* a nexthop lookup can not go through a gw. > +* if this happens on a table based lookup > +* then fallback to a full lookup > +*/ > + if (grt && grt->rt6i_flags & RTF_GATEWAY) { > + ip6_rt_put(grt); > + grt = NULL; > + } > + } > + > if (!grt) > grt = rt6_lookup(net, gw_addr, NULL, > cfg->fc_ifindex, 1); OK. Should the dev check be dismissed or do we add "dev && dev != grt->dst.dev" just as a safety net (this would be a convulated setup, but the correct direct route could be in an ip rule with higher priority while the one in this table is incorrect)? -- "... an experienced, industrious, ambitious, and often quite often picturesque liar." -- Mark Twain
[PATCH] net: ipv6: fallback to full lookup if table lookup is unsuitable
Commit 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups") introduced a regression: insertion of an IPv6 route in a table not containing the appropriate connected route for the gateway but which contained a non-connected route (like a default gateway) fails while it was previously working: $ ip link add eth0 type dummy $ ip link set up dev eth0 $ ip addr add 2001:db8::1/64 dev eth0 $ ip route add ::/0 via 2001:db8::5 dev eth0 table 20 $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 RTNETLINK answers: No route to host $ ip -6 route show table 20 default via 2001:db8::5 dev eth0 metric 1024 pref medium After this patch, we get: $ ip route add 2001:db8:cafe::1/128 via 2001:db8::6 dev eth0 table 20 $ ip -6 route show table 20 2001:db8:cafe::1 via 2001:db8::6 dev eth0 metric 1024 pref medium default via 2001:db8::5 dev eth0 metric 1024 pref medium Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- net/ipv6/route.c | 48 +++- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ad4a7ff301fc..c2aaddcfed9e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1808,6 +1808,30 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, return rt; } +static int ip6_nh_valid(struct rt6_info *grt, + struct net_device **dev, struct inet6_dev **idev) { + int ret = 0; + + if (!grt) + goto out; + if (grt->rt6i_flags & RTF_GATEWAY) + goto out; + if (*dev) { + if (*dev != grt->dst.dev) + goto out; + } else { + *dev = grt->dst.dev; + *idev = grt->rt6i_idev; + dev_hold(*dev); + in6_dev_hold(*idev); + } + ret = 1; +out: + if (grt) + ip6_rt_put(grt); + return ret; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -1991,33 +2015,15 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; + err = -EHOSTUNREACH; if (cfg->fc_table) grt = ip6_nh_lookup_table(net, cfg, gw_addr); - - if (!grt) + if (!ip6_nh_valid(grt, , )) { grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); - - err = -EHOSTUNREACH; - if (!grt) - goto out; - if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); + if (!ip6_nh_valid(grt, , )) goto out; - } - } else { - dev = grt->dst.dev; - idev = grt->rt6i_idev; - dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - ip6_rt_put(grt); - - if (err) - goto out; } err = -EINVAL; if (!dev || (dev->flags & IFF_LOOPBACK)) -- 2.9.3
Re: [PATCH v5 0/6] Add eBPF hooks for cgroups
❦ 12 septembre 2016 18:12 CEST, Daniel Mack: > * The sample program learned to support both ingress and egress, and > can now optionally make the eBPF program drop packets by making it > return 0. Ability to lock the eBPF program to avoid modification from a later program or in a subcgroup would be pretty interesting from a security perspective. -- Use recursive procedures for recursively-defined data structures. - The Elements of Programming Style (Kernighan & Plauger)
Re: [net v1] fib_rules: interface group matching
❦ 14 septembre 2016 17:25 CEST, David Ahern: >> I could just give more time to VRF. I also had some concerns over >> performance with the way Netfilter integration is done, but I understand >> that I could just stay away from POSTROUTING rules which is the only >> hook executed twice? > With the changes that were committed this past weekend, the VRF code > is now setup where I can set a flag on a per VRF basis to disable the > extra rx and tx processing - ie., no network taps, no netfilter, no > qdisc, etc. Drops the overhead of VRF to ~3% maybe a bit less. I need > to think about the user api a bit more and formalize the patch. Given > my other commitments that probably won't happen until mid-October. But > in terms of a building block, the overhead of VRF is continuing to > drop. Fine by me. We can drop my patch. Thanks! -- Program defensively. - The Elements of Programming Style (Kernighan & Plauger)
Re: [net v1] fib_rules: interface group matching
❦ 14 septembre 2016 16:39 CEST, David Ahern: When a user wants to assign a routing table to a group of incoming interfaces, the current solutions are: - one IP rule for each interface (scalability problems) - use of fwmark and devgroup matcher (don't work with internal route lookups, used for example by RPF) - use of VRF devices (more complex) >>> >>> Why do you believe that? A VRF is a formalized grouping of interfaces >>> that includes an API for locally generated traffic to specify which >>> VRF/group to use. And, with the l3mdev rule you only need 1 rule for >>> all VRFs regardless of the number which is the best solution to the >>> scalability problem of adding rules per device/group/VRF. >>> >>> What use case are trying to solve? >> >> Local processes have to be made aware of the VRF by binding to the >> pseudo-device. Some processes may be tricked by LD_PRELOAD but some >> won't (like stuff written in Go). Maybe I should just find a better way >> to bind a process to a VRF without its cooperation. > > What API are you using for interface groups? How does an app tell the > kernel to use interface group 1 versus group 2? In my testbed, I have only one local application which is dnsmasq as a DHCP server. It sends back the answer to the physical interface (with sendmsg() and auxillary data). So it makes my argument a bit moot as the situation is in fact worse without VRF. :-/ My testbed is here (with use of VRF, more recent commits just use plain ip rules): https://github.com/vincentbernat/network-lab/blob/d86e9ed658863ef0f51d7b853d0dc9f8b7427b21/lab-l3-hyperv/setup I could just give more time to VRF. I also had some concerns over performance with the way Netfilter integration is done, but I understand that I could just stay away from POSTROUTING rules which is the only hook executed twice? -- All things that are, are with more spirit chased than enjoyed. -- Shakespeare, "Merchant of Venice"
Re: [net v1] fib_rules: interface group matching
❦ 14 septembre 2016 16:15 CEST, David Ahern: >> When a user wants to assign a routing table to a group of incoming >> interfaces, the current solutions are: >> >> - one IP rule for each interface (scalability problems) >> - use of fwmark and devgroup matcher (don't work with internal route >>lookups, used for example by RPF) >> - use of VRF devices (more complex) > > Why do you believe that? A VRF is a formalized grouping of interfaces > that includes an API for locally generated traffic to specify which > VRF/group to use. And, with the l3mdev rule you only need 1 rule for > all VRFs regardless of the number which is the best solution to the > scalability problem of adding rules per device/group/VRF. > > What use case are trying to solve? Local processes have to be made aware of the VRF by binding to the pseudo-device. Some processes may be tricked by LD_PRELOAD but some won't (like stuff written in Go). Maybe I should just find a better way to bind a process to a VRF without its cooperation. -- Instrument your programs. Measure before making "efficiency" changes. - The Elements of Programming Style (Kernighan & Plauger)
Re: [net v1] fib_rules: interface group matching
❦ 14 septembre 2016 14:40 CEST, Vincent Bernat <vinc...@bernat.im> : > Each interface can be assigned to a numeric group using IFLA_GROUP. This > commit enables a user to reference such a group into an IP rule. Here is > an example of output of iproute2: > > $ ip rule show > 0: from all lookup local > 32764: from all iifgroup 2 lookup 2 > 32765: from all iifgroup 1 lookup 1 > 32766: from all lookup main > 32767: from all lookup default The patch for iproute2 is available here (didn't post it inline to avoid confuse patchwork): http://paste.debian.net/821247/ -- Say what you mean, simply and directly. - The Elements of Programming Style (Kernighan & Plauger)
[net v1] fib_rules: interface group matching
When a user wants to assign a routing table to a group of incoming interfaces, the current solutions are: - one IP rule for each interface (scalability problems) - use of fwmark and devgroup matcher (don't work with internal route lookups, used for example by RPF) - use of VRF devices (more complex) Each interface can be assigned to a numeric group using IFLA_GROUP. This commit enables a user to reference such a group into an IP rule. Here is an example of output of iproute2: $ ip rule show 0: from all lookup local 32764: from all iifgroup 2 lookup 2 32765: from all iifgroup 1 lookup 1 32766: from all lookup main 32767: from all lookup default Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- include/net/fib_rules.h| 6 - include/uapi/linux/fib_rules.h | 2 ++ net/core/fib_rules.c | 57 +- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 456e4a6006ab..a96b186ccd02 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -28,6 +28,8 @@ struct fib_rule { u32 pref; int suppress_ifgroup; int suppress_prefixlen; + int iifgroup; + int oifgroup; chariifname[IFNAMSIZ]; charoifname[IFNAMSIZ]; struct rcu_head rcu; @@ -92,7 +94,9 @@ struct fib_rules_ops { [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ - [FRA_L3MDEV]= { .type = NLA_U8 } + [FRA_L3MDEV]= { .type = NLA_U8 }, \ + [FRA_IIFGROUP] = { .type = NLA_U32 }, \ + [FRA_OIFGROUP] = { .type = NLA_U32 } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 14404b3ebb89..0bf5a5e94d9a 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -51,6 +51,8 @@ enum { FRA_OIFNAME, FRA_PAD, FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ + FRA_IIFGROUP, /* interface group */ + FRA_OIFGROUP, __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index be4629c344a6..f8ed6ba85c72 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -37,6 +37,9 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; + r->iifgroup = -1; + r->oifgroup = -1; + /* The lock is not required here, the list in unreacheable * at the moment this function is called */ list_add_tail(>list, >rules_list); @@ -193,6 +196,30 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg)) goto out; + if (rule->iifgroup != -1) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(rule->fr_net, fl->flowi_iif); + if (!dev || dev->group != rule->iifgroup) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } + + if (rule->oifgroup != -1) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(rule->fr_net, fl->flowi_oif); + if (!dev || dev->group != rule->oifgroup) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -305,6 +332,12 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->l3mdev != rule->l3mdev) continue; + if (r->iifgroup != rule->iifgroup) + continue; + + if (r->oifgroup != rule->oifgroup) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -391,6 +424,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout_free; } + if (tb[FRA_IIFGROUP]) + rule->iifgroup = nla_get_u32(tb[FRA_IIFGROUP]); + else + rule->iifgroup = -1; + + if (tb[FRA_OIFGROUP]) + rule->oifgroup = nla_get_u32(tb[FRA_OIFGROUP]); + else + rule->oifgroup = -1; + rule-&
Re: [net v4] veth: advertise peer link once both links are tied together
❦ 31 mai 2016 08:30 CEST, Vincent Bernat <vinc...@bernat.im> : > diff --git a/drivers/net/veth.c b/drivers/net/veth.c > index f37a6e61d4ad..aaa1b023b9f2 100644 > --- a/drivers/net/veth.c > +++ b/drivers/net/veth.c > @@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct > net_device *dev, > > priv = netdev_priv(peer); > rcu_assign_pointer(priv->peer, dev); > + > + rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U, GFP_KERNEL); > return 0; Well, this is wrong here too because the NEWLINK message for dev was not issued either (link state is not initialized), so we are back to square one. 3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff The netlink message for veth1 is sent by rtnl_newlink() after invocation of veth_newlink(). So, I don't see any easy way to fix that. Maybe not worth it. -- Let the data structure the program. - The Elements of Programming Style (Kernighan & Plauger)
[net v4] veth: advertise peer link once both links are tied together
When the peer link is created, its "iflink" information is not advertised through Netlink. Once created, the local device is advertised with this information but if a user is maintaining a cache from all updates, it will still miss the iflink for the peer link: 2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff 3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff With this patch, we advertise again the peer link to let any user pick the appropriate iflink information: 3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- v4: - use ~0U instead of IFF_SLAVE for ifi_change v3: - send an additional netlink messages once the peer link is tied to avoid any chicken/egg problem v2: - ensure the device is unregistered in case of link configuration failure drivers/net/veth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..aaa1b023b9f2 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U, GFP_KERNEL); return 0; err_register_dev: -- 2.8.1
Re: [net v3] veth: advertise peer link once both links are tied together
❦ 30 mai 2016 18:27 CEST, Nicolas Dichtel: >>> + >>> + rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL); >> >> Maybe ~0U would be better than hijacking IFF_SLAVE? > IFF_SLAVE is wrong. It's a flag here, that will be put in the ifi_change field > not an attribute number. There are some use of IFF_SLAVE (in bonding/bond_main.c). But, I'll update the patch nonetheless. -- Use free-form input when possible. - The Elements of Programming Style (Kernighan & Plauger)
Re: [net v3] veth: advertise peer link once both links are tied together
❦ 30 mai 2016 17:58 CEST, Vincent Bernat <vinc...@bernat.im> : > + > + rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL); Maybe ~0U would be better than hijacking IFF_SLAVE? -- Anyone who has had a bull by the tail knows five or six more things than someone who hasn't. -- Mark Twain
[net v3] veth: advertise peer link once both links are tied together
When the peer link is created, its "iflink" information is not advertised through Netlink. Once created, the local device is advertised with this information but if a user is maintaining a cache from all updates, it will still miss the iflink for the peer link: 2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff 3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff With this patch, we advertise again the peer link to let any user pick the appropriate iflink information: 3: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether fa:ba:12:26:99:00 brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ea:e4:e2:26:3c:87 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- v3: - send an additional netlink messages once the peer link is tied to avoid any chicken/egg problem v2: - ensure the device is unregistered in case of link configuration failure drivers/net/veth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..2376d17b8f53 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -466,6 +466,8 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + rtmsg_ifinfo(RTM_NEWLINK, peer, IFF_SLAVE, GFP_KERNEL); return 0; err_register_dev: -- 2.8.1
Re: [PATCH] veth: delay peer link configuration after interfaces are tied
❦ 30 mai 2016 17:19 CEST, Nicolas Dichtel: priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + err = rtnl_configure_link(peer, ifmp); + if (err < 0) + goto err_configure_peer; >> >>> You should fix the error path. 'unregister_netdevice(dev)' is missing. >> >> I am sending another patch to fix that. I am quite unsure if I do the >> right thing here. >> > A less intrusive fix is to call 'rtmsg_ifinfo(RTM_NEWLINK, peer, ~0U, > GFP_KERNEL);' a the end of veth_newlink(). I did that at first. Maybe this would make more sense to do that. Otherwise, the first message contains an iflink value that we cannot resolve with just the received netlink messages (since the information is in the next netlink message). "ip monitor" seems to be able to get the info, but I suppose it does an additional lookup. -- Make sure every module hides something. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH] veth: delay peer link configuration after interfaces are tied
❦ 30 mai 2016 12:12 CEST, Vincent Bernat <vinc...@bernat.im> : > When the peer link is created, its "iflink" information is not [...] And that's the wrong patch... Please, ignore this one. -- Don't stop with your first draft. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH] veth: delay peer link configuration after interfaces are tied
❦ 30 mai 2016 11:23 CEST, Nicolas Dichtel: >> @@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct >> net_device *dev, >> >> priv = netdev_priv(peer); >> rcu_assign_pointer(priv->peer, dev); >> + >> +err = rtnl_configure_link(peer, ifmp); >> +if (err < 0) >> +goto err_configure_peer; > You should fix the error path. 'unregister_netdevice(dev)' is missing. I am sending another patch to fix that. I am quite unsure if I do the right thing here. -- Don't stop with your first draft. - The Elements of Programming Style (Kernighan & Plauger)
[PATCH] veth: delay peer link configuration after interfaces are tied
When the peer link is created, its "iflink" information is not advertised through netlink. If a user is maintaining a cache from all updates, it will miss this information: 2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff 3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff To avoid this situation, the peer link is only configured after both interfaces are tied together: 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/veth.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..7580f6765948 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); - if (err < 0) - goto err_configure_peer; - /* * register dev last * @@ -466,8 +462,17 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + err = rtnl_configure_link(peer, ifmp); + if (err < 0) + goto err_configure_link; return 0; +err_configure_link: + RCU_INIT_POINTER(priv->peer, NULL); + priv = netdev_priv(dev); + RCU_INIT_POINTER(priv->peer, NULL); + unregister_netdevice(dev); err_register_dev: /* nothing to do */ err_configure_peer: -- 2.8.1
[PATCH] veth: delay peer link configuration after interfaces are tied
When the peer link is created, its "iflink" information is not advertised through netlink. If a user is maintaining a cache from all updates, it will miss this information: 2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff 3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff To avoid this situation, the peer link is only configured after both interfaces are tied together: 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/veth.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..9726c4dbf659 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); - if (err < 0) - goto err_configure_peer; - /* * register dev last * @@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + err = rtnl_configure_link(peer, ifmp); + if (err < 0) + goto err_configure_peer; return 0; err_register_dev: -- 2.8.1
[PATCH] veth: delay peer link configuration after interfaces are tied
When the peer link is created, its "iflink" information is not advertised through netlink. If a user is maintaining a cache from all updates, it will miss this information: 2: veth0@NONE: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ae:0e:08:af:fb:a0 brd ff:ff:ff:ff:ff:ff 3: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether 3a:31:f1:36:2e:e5 brd ff:ff:ff:ff:ff:ff To avoid this situation, the peer link is only configured after both interfaces are tied together: 3: veth0@veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default link/ether ee:0d:80:46:36:fe brd ff:ff:ff:ff:ff:ff 4: veth1@veth0: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN group default link/ether ba:25:bc:7a:0d:c8 brd ff:ff:ff:ff:ff:ff Signed-off-by: Vincent Bernat <vinc...@bernat.im> --- drivers/net/veth.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index f37a6e61d4ad..9726c4dbf659 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -432,10 +432,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); - if (err < 0) - goto err_configure_peer; - /* * register dev last * @@ -466,6 +462,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + + err = rtnl_configure_link(peer, ifmp); + if (err < 0) + goto err_configure_peer; return 0; err_register_dev: -- 2.8.1
Re: [PATCH v2] socket.7: Document some BPF-related socket options
❦ 1 mars 2016 11:03 +0100, "Michael Kerrisk (man-pages)": > Once the SO_LOCK_FILTER option has been enabled, > attempts by an unprivileged process to change or remove > the filter attached to a socket, or to disable the > SO_LOCK_FILTER option will fail with the error EPERM. You should remove "unprivileged". I didn't try to check for permissions because I was just lazy (and I didn't have a need for it). As root, you can just recreate another socket. -- Choose a data representation that makes the program simple. - The Elements of Programming Style (Kernighan & Plauger)
Re: [PATCH] veth: replace iflink by a dedicated symlink in sysfs
❦ 20 août 2015 14:07 -0700, David Miller da...@davemloft.net : I also don't know what is the best way to handle this. veth advertises its peer via IFLA_LINK since 4.1, so it's too late to change it for this release. Apparently we need to pick our poison. Either way, we break something. Sure. I would prefer to have the same mechanism in all version, but I can live with the other solution. David, any thoughts about this? You can't break the 4.1 semantics, it's in a released kernel and people _ARE_ using it. I had a look at what other kind of daemons may exploit the pre-4.1 semantics (of not having an infinite loop when following iflink) and failed to find any other users than lldpd. Other LLDP daemons (lldpad, ladvd, openlldpd) have other ways to find the lower interface. I would also have thought that NetSNMP would use it to implement ifStackTable but it doesn't in fact implement this table. -- It were not best that we should all think alike; it is difference of opinion that makes horse-races. -- Mark Twain, Pudd'nhead Wilson's Calendar -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] veth: replace iflink by a dedicated symlink in sysfs
❦ 19 août 2015 14:38 +0200, Jiri Benc jb...@redhat.com : That's the main goal of this patch: advertising the peer link as IFLA_LINK attribute triggers an infinite loop in userland software when they follow iflink to discover network devices topology. iflink has always been the index of a lower device. If a sysfs symbolic link is not good enough, I can propose a new IFLA_PEER attribute instead. This would cause regression and break applications for those of us who started relying on the netnsid feature to match interfaces across net name spaces. Yes. Unfortunately. This is tough. If you're going to do such thing, you would at least need to also introduce IFLA_PEER_NETNSID. Yes I can. In my opinion, the change of semantics of IFLA_LINK is a break of API. However, I can live with it since it's easy to workaround it. It just seemed easier to start the discussion with a patch. -- Parenthesise to avoid ambiguity. - The Elements of Programming Style (Kernighan Plauger) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Regression in 4.1 with veth and IFLA_LINK
Here is a proposed patch to fix this by providing a symlink to the peer instead. This may not totally replace the use of iflink since the peer won't be available through netlink anymore but maybe this is good enough for the intended use. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] veth: replace iflink by a dedicated symlink in sysfs
While the documentation doesn't say exactly what kind of relationship iflink should represent, until a45253, only lower devices were advertised this way. While veth cannot have a lower device, using iflink to advertise the peer may create infinite loops in programs using iflink to discover device topology. Instead of advertising the peer link with iflink, a symbolic link peer is added to each peer. Signed-off-by: Vincent Bernat vinc...@bernat.im --- drivers/net/veth.c | 40 +--- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index c8186ffda1a3..47f165bc3107 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -105,6 +105,17 @@ static const struct ethtool_ops veth_ethtool_ops = { .get_ethtool_stats = veth_get_ethtool_stats, }; +static int veth_peer_sysfs_add(struct net_device *dev, + struct net_device *peer_dev) +{ + return sysfs_create_link((dev-dev.kobj), (peer_dev-dev.kobj), +peer); +} +static void veth_peer_sysfs_del(struct net_device *dev) +{ + sysfs_remove_link((dev-dev.kobj), peer); +} + static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -263,20 +274,6 @@ static void veth_poll_controller(struct net_device *dev) } #endif /* CONFIG_NET_POLL_CONTROLLER */ -static int veth_get_iflink(const struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - struct net_device *peer; - int iflink; - - rcu_read_lock(); - peer = rcu_dereference(priv-peer); - iflink = peer ? peer-ifindex : 0; - rcu_read_unlock(); - - return iflink; -} - static const struct net_device_ops veth_netdev_ops = { .ndo_init= veth_dev_init, .ndo_open= veth_open, @@ -289,7 +286,6 @@ static const struct net_device_ops veth_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller= veth_poll_controller, #endif - .ndo_get_iflink = veth_get_iflink, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO |\ @@ -436,6 +432,13 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, netif_carrier_off(dev); + err = veth_peer_sysfs_add(dev, peer); + if (err 0) + goto err_configure_dev; + err = veth_peer_sysfs_add(peer, dev); + if (err 0) + goto err_configure_dev; + /* * tie the deviced together */ @@ -447,9 +450,13 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, rcu_assign_pointer(priv-peer, dev); return 0; +err_configure_dev: + veth_peer_sysfs_del(dev); + veth_peer_sysfs_del(peer); err_register_dev: /* nothing to do */ err_configure_peer: + veth_peer_sysfs_del(dev); unregister_netdevice(peer); return err; @@ -466,6 +473,9 @@ static void veth_dellink(struct net_device *dev, struct list_head *head) priv = netdev_priv(dev); peer = rtnl_dereference(priv-peer); + veth_peer_sysfs_del(dev); + if (peer) veth_peer_sysfs_del(dev); + /* Note : dellink() is called from default_device_exit_batch(), * before a rcu_synchronize() point. The devices are guaranteed * not being freed before one RCU grace period. -- 2.5.0 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html