With the following tweaks to the IPv4 stack:
- enslaving devices to a VRF device automatically moves routes to the VRF table; removing the VRF master moves routes back to the main table
- the following use cases work for both Rx and Tx: + ICMP (ping -I <vrf-device> <ip>) + TCP server and client bound to VRF device + TCP server not bound to VRF device but working through it * client connections are bound to VRF device + UDP server and client bound to VRF device Signed-off-by: Shrijeet Mukherjee <s...@cumulusnetworks.com> Signed-off-by: David Ahern <d...@cumulusnetworks.com> --- include/net/flow.h | 1 + include/net/inet_hashtables.h | 9 +++++++-- include/net/route.h | 4 ++++ net/ipv4/fib_frontend.c | 30 ++++++++++++++++++++---------- net/ipv4/fib_semantics.c | 25 ++++++++++++++++++++----- net/ipv4/fib_trie.c | 7 +++++-- net/ipv4/icmp.c | 4 ++++ net/ipv4/ping.c | 3 ++- net/ipv4/raw.c | 5 +++-- net/ipv4/route.c | 12 ++++++++++-- net/ipv4/syncookies.c | 4 +++- net/ipv4/tcp_input.c | 6 +++++- net/ipv4/tcp_ipv4.c | 6 ++++-- net/ipv4/udp.c | 2 ++ 14 files changed, 90 insertions(+), 28 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 8109a159d1b3..69aaa99fdeb8 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -29,6 +29,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_VRFSRC 0x04 __u32 flowic_secid; }; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b73c88a19dd4..e26c43823a13 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -31,6 +31,7 @@ #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> +#include <net/vrf.h> #include <linux/atomic.h> #include <asm/byteorder.h> @@ -300,10 +301,14 @@ static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, - const int dif) + int dif) { u16 hnum = ntohs(dport); - struct sock *sk = __inet_lookup_established(net, hashinfo, + struct sock *sk; + + dif = vrf_get_master_dev_idx(net, dif) ? 
: dif; + + sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif); return sk ? : __inet_lookup_listener(net, hashinfo, saddr, sport, diff --git a/include/net/route.h b/include/net/route.h index fe22d03afb6a..460333bab217 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -188,6 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); @@ -250,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; + if (netif_idx_is_vrf(sock_net(sk), oif)) + flow_flags |= FLOWI_FLAG_VRFSRC; + flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 974fa51effca..7c73eb058c91 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,6 +45,7 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/xfrm.h> +#include <net/vrf.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -212,7 +213,7 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr) + __be32 addr, int rt_table) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; @@ -225,8 +226,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return RTN_MULTICAST; rcu_read_lock(); - - local_table = fib_get_table(net, RT_TABLE_LOCAL); + local_table = fib_get_table(net, rt_table); if (local_table) { ret = RTN_UNICAST; if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { @@ -239,16 +239,24 @@ 
static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id) +{ + return __inet_dev_addr_type(net, NULL, addr, tb_id); +} +EXPORT_SYMBOL(inet_addr_type_table); + unsigned int inet_addr_type(struct net *net, __be32 addr) { - return __inet_dev_addr_type(net, NULL, addr); + return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); @@ -309,7 +317,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_iif = vrf_master_dev_idx(dev); + if (!fl4.flowi4_iif) + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -761,6 +771,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); + int tb_id = vrf_dev_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -775,11 +786,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); + if (!tb_id) + tb_id = (type == RTN_UNICAST) ? 
RT_TABLE_MAIN : RT_TABLE_LOCAL; + tb = fib_new_table(net, tb_id); if (!tb) return; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3bfccd83551c..3c3e2006ce72 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -760,6 +760,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fib_prefsrc != cfg->fc_dst) { + int tb_id = cfg->fc_table; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + if (inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id) != RTN_LOCAL) { + return false; + } + } + return true; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -940,11 +957,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc) { - if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || - fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; + + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { + goto err_inval; } change_nexthops(fi) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ac2d828c6daa..7da901c56e35 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1421,8 +1421,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; + if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (flp->flowi4_oif && + flp->flowi4_oif != nh->nh_oif) + continue; + } if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fba6236..115d3c1c548f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,6 +96,7 @@ #include <net/xfrm.h> #include 
<net/inet_common.h> #include <net/ip_fib.h> +#include <net/vrf.h> /* * Build xmit assembly blocks @@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; + fl4.flowi4_oif = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; + fl4->flowi4_oif = vrf_master_dev_idx(skb_in->dev) ? : skb_in->dev->ifindex; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 05ff44b758df..685fada659f5 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -44,6 +44,7 @@ #include <net/route.h> #include <net/inet_common.h> #include <net/checksum.h> +#include <net/vrf.h> #if IS_ENABLED(CONFIG_IPV6) #include <linux/in6.h> @@ -174,7 +175,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) struct sock *sk = NULL; struct inet_sock *isk; struct hlist_nulls_node *hnode; - int dif = skb->dev->ifindex; + int dif = vrf_master_dev_idx(skb->dev) ? 
: skb->dev->ifindex; if (skb->protocol == htons(ETH_P_IP)) { pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n", diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 561cd4b8fc6e..95ef2834533d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -72,6 +72,7 @@ #include <net/inet_common.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/vrf.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -171,6 +172,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) struct hlist_head *head; int delivered = 0; struct net *net; + int idx = vrf_master_dev_idx(skb->dev) ? : skb->dev->ifindex; read_lock(&raw_v4_hashinfo.lock); head = &raw_v4_hashinfo.ht[hash]; @@ -179,8 +181,7 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash) net = dev_net(skb->dev); sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol, - iph->saddr, iph->daddr, - skb->dev->ifindex); + iph->saddr, iph->daddr, idx); while (sk) { delivered = 1; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d0362a2de3d3..c66fdeb3a101 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -109,6 +109,7 @@ #include <linux/kmemleak.h> #endif #include <net/secure_seq.h> +#include <net/vrf.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1710,7 +1711,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_iif = vrf_master_dev_idx(dev) ? : dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -2089,6 +2090,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (!dev_out) goto out; + if (netif_is_vrf(dev_out)) + fl4->flowi4_flags |= FLOWI_FLAG_VRFSRC; + /* RACE: Check return value of inet_select_addr instead. 
*/ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); @@ -2273,8 +2277,12 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, struct sock *sk) { - struct rtable *rt = __ip_route_output_key(net, flp4); + struct rtable *rt; + + if (netif_idx_is_vrf(net, flp4->flowi4_oif)) + flp4->flowi4_flags |= FLOWI_FLAG_VRFSRC; + rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d70b1f603692..120f4406ba7a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -348,7 +348,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; treq->tfo_listener = false; - ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_iif = vrf_get_master_dev_idx(sock_net(sk), skb->skb_iif); + if (!ireq->ir_iif) + ireq->ir_iif = sk->sk_bound_dev_if; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 684f095d196e..3018b4f795eb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include <net/dst.h> #include <net/tcp.h> #include <net/inet_common.h> +#include <net/vrf.h> #include <linux/ipsec.h> #include <asm/unaligned.h> #include <linux/errqueue.h> @@ -6138,7 +6139,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init(req, &tmp_opt, skb, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ - inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; + inet_rsk(req)->ir_iif = vrf_get_master_dev_idx(sock_net(sk), + skb->skb_iif); + if (!inet_rsk(req)->ir_iif) + inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; af_ops->init_req(req, sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 
d7d4c2b79cf2..c03e28477275 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -682,6 +682,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) */ if (sk) arg.bound_dev_if = sk->sk_bound_dev_if; + if (!arg.bound_dev_if) + arg.bound_dev_if = vrf_master_dev_idx(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), @@ -766,8 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - if (oif) - arg.bound_dev_if = oif; + arg.bound_dev_if = oif ? : vrf_master_dev_idx(skb_dst(skb)->dev); + arg.tos = tos; ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 83aa604f9273..cf706d7898a2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -501,6 +501,8 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int score, badness, matches = 0, reuseport = 0; u32 hash = 0; + dif = vrf_get_master_dev_idx(net, dif) ? : dif; + rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html