This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected into the peer netns through the loopback device of
that netns. This only works when the output device of the route is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 dev lo

Signed-off-by: Nicolas Dichtel <nicolas.dich...@6wind.com>
---

v2: rework loopback handling part (update stats and call skb_dst_force())
    fix ipv6 processing
    check lwtunnel type before converting data to a nsid

 drivers/net/loopback.c        | 33 +++++++++++++++++++++------
 include/net/lwtunnel.h        | 27 ++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/route.c              |  9 ++++++--
 5 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..4358256ff94e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
        u64                     packets;
@@ -71,29 +72,59 @@ struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
                                  struct net_device *dev)
 {
+       int nsid = skb_lwt_netns_info(skb);
        struct pcpu_lstats *lb_stats;
-       int len;
-
-       skb_orphan(skb);
+       struct net *peernet = NULL;
+       int len, ret;
 
        /* Before queueing this packet to netif_rx(),
         * make sure dst is refcounted.
         */
        skb_dst_force(skb);
 
-       skb->protocol = eth_type_trans(skb, dev);
+       if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
+               /* The route points to a peer netns: drop the packet if no
+                * netns matches the nsid; peernet is released at 'end'.
+                */
+               peernet = get_net_ns_by_id(dev_net(dev), nsid);
+               if (!peernet) {
+                       kfree_skb(skb);
+                       goto end;
+               }
+
+               /* it's OK to use per_cpu_ptr() because BHs are off */
+               lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
+
+               /* dev_forward_skb() consumes the skb, so its length must be
+                * read before the skb is handed over.
+                */
+               len = skb->len;
+               ret = dev_forward_skb(peernet->loopback_dev, skb);
+       } else {
+               skb_orphan(skb);
 
-       /* it's OK to use per_cpu_ptr() because BHs are off */
-       lb_stats = this_cpu_ptr(dev->lstats);
+               skb->protocol = eth_type_trans(skb, dev);
+
+               /* it's OK to use per_cpu_ptr() because BHs are off */
+               lb_stats = this_cpu_ptr(dev->lstats);
+
+               /* netif_rx() takes ownership of the skb, so its length must
+                * be read before the skb is queued.
+                */
+               len = skb->len;
+               ret = netif_rx(skb);
+       }
 
-       len = skb->len;
-       if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
+       if (likely(ret == NET_RX_SUCCESS)) {
                u64_stats_update_begin(&lb_stats->syncp);
                lb_stats->bytes += len;
                lb_stats->packets++;
                u64_stats_update_end(&lb_stats->syncp);
        }
 
+end:
+       if (peernet)
+               put_net(peernet);
        return NETDEV_TX_OK;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index b02039081b04..78376da1afa2 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -147,4 +149,39 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+/* Return the private data of @lwtstate viewed as a pointer to a nsid.
+ * Only meaningful when @lwtstate->type is LWTUNNEL_ENCAP_NETNS.
+ */
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+       return (u32 *)lwtstate->data;
+}
+
+/* Return the nsid attached to the route of @skb, or NETNSA_NSID_NOT_ASSIGNED
+ * when the skb is neither IPv4 nor IPv6, has no dst, or its route carries no
+ * LWTUNNEL_ENCAP_NETNS state.
+ *
+ * The encap type is an enumerator, not a bit flag, hence it is compared with
+ * '==': a bitwise '&' would also match other encap types.
+ */
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+       if (skb->protocol == htons(ETH_P_IP)) {
+               struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+               if (rt &&
+                   rt->rt_lwtstate &&
+                   rt->rt_lwtstate->type == LWTUNNEL_ENCAP_NETNS)
+                       return *lwt_netns_info(rt->rt_lwtstate);
+       } else if (skb->protocol == htons(ETH_P_IPV6)) {
+               struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+               if (rt6 &&
+                   rt6->rt6i_lwtstate &&
+                   rt6->rt6i_lwtstate->type == LWTUNNEL_ENCAP_NETNS)
+                       return *lwt_netns_info(rt6->rt6i_lwtstate);
+       }
+
+       return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@ enum lwtunnel_encap_types {
        LWTUNNEL_ENCAP_NONE,
        LWTUNNEL_ENCAP_MPLS,
        LWTUNNEL_ENCAP_IP,
+       LWTUNNEL_ENCAP_NETNS,   /* route points to a peer netns (nsid) */
        __LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *     Our network namespace constructor/destructor lists
@@ -725,6 +726,74 @@ out:
        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+/* Build the lwtunnel state of a LWTUNNEL_ENCAP_NETNS route.
+ *
+ * The nested attribute @nla must carry a NETNSA_NSID. Its value is stored in
+ * the private data of a newly allocated state, which is returned through @ts.
+ * Returns 0 on success or a negative errno.
+ */
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+                                struct lwtunnel_state **ts)
+{
+       struct nlattr *tb[NETNSA_MAX + 1];
+       struct lwtunnel_state *newts;
+       int nsid;
+       int ret;
+
+       ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+       if (ret < 0)
+               return ret;
+
+       if (!tb[NETNSA_NSID])
+               return -EINVAL;
+
+       /* Reject negative values before allocating anything:
+        * skb_lwt_netns_info() returns NETNSA_NSID_NOT_ASSIGNED to mean "no
+        * netns encap", so such a nsid must never be stored in a state.
+        */
+       nsid = nla_get_s32(tb[NETNSA_NSID]);
+       if (nsid < 0)
+               return -EINVAL;
+
+       newts = lwtunnel_state_alloc(sizeof(nsid));
+       if (!newts)
+               return -ENOMEM;
+
+       newts->len = sizeof(nsid);
+       newts->type = LWTUNNEL_ENCAP_NETNS;
+       *lwt_netns_info(newts) = nsid;
+
+       *ts = newts;
+       return 0;
+}
+
+/* Dump the nsid of a LWTUNNEL_ENCAP_NETNS state as a NETNSA_NSID attribute.
+ * Returns 0 on success, -EMSGSIZE if the attribute could not be added to skb.
+ */
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+                                    struct lwtunnel_state *lwtstate)
+{
+       int nsid = *lwt_netns_info(lwtstate);
+
+       if (nla_put_s32(skb, NETNSA_NSID, nsid))
+               return -EMSGSIZE;
+
+       return 0;
+}
+
+/* Netlink attribute space needed by lwt_netns_fill_encap_info(). */
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+       return nla_total_size(sizeof(s32));     /* NETNSA_NSID */
+}
+
+/* lwtunnel operations registered for LWTUNNEL_ENCAP_NETNS. */
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+       .build_state = lwt_netns_build_state,
+       .fill_encap = lwt_netns_fill_encap_info,
+       .get_encap_size = lwt_netns_encap_nlsize,
+};
+
 static int __init net_ns_init(void)
 {
        struct net_generic *ng;
@@ -762,6 +831,7 @@ static int __init net_ns_init(void)
        rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
                      NULL);
 
+       lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
        return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c9b2b9fe83fc..894cb18cd8ca 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1796,10 +1796,13 @@ int ip6_route_add(struct fib6_config *cfg)
        rt->rt6i_metric = cfg->fc_metric;
 
        /* We cannot add true routes via loopback here,
-          they would result in kernel looping; promote them to reject routes
+        * they would result in kernel looping; promote them to reject routes.
+        * Exception: routes that point to a peer netns.
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
+            (!rt->rt6i_lwtstate ||
+             rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
@@ -2880,7 +2883,9 @@ static int rt6_fill_node(struct net *net,
        }
        else if (rt->rt6i_flags & RTF_LOCAL)
                rtm->rtm_type = RTN_LOCAL;
-       else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+       else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK) &&
+               (!rt->rt6i_lwtstate ||
+                rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS))
                rtm->rtm_type = RTN_LOCAL;
        else
                rtm->rtm_type = RTN_UNICAST;
-- 
2.4.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to