This patch adds support for checkpointing and restoring route information.
It keeps enough information to restore basic routes at the level of detail
of /proc/net/route.  It uses RTNETLINK to extract the information during
checkpoint and also to insert it back during restore.  This gives us a
nice layer of isolation between us and the various "fib" implementations.

Signed-off-by: Dan Smith <da...@us.ibm.com>
---
 include/linux/checkpoint_hdr.h |   31 +++
 net/checkpoint_dev.c           |  412 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 442 insertions(+), 1 deletions(-)

diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 633c9b0..187d706 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -23,6 +23,7 @@
 #include <sys/un.h>
 #include <netinet/in.h>
 #endif
+#include <linux/if.h>
 
 /*
  * /usr/include/linux/security.h is not exported to userspace, so
@@ -783,6 +784,7 @@ struct ckpt_hdr_file_socket {
 struct ckpt_hdr_netns {
        struct ckpt_hdr h;
        __s32 this_ref;
+       __u32 routes; /* number of struct ckpt_route records written after this header */
 } __attribute__((aligned(8)));
 
 enum ckpt_netdev_types {
@@ -837,6 +839,35 @@ struct ckpt_netdev_addr {
        } __attribute__((aligned(8)));
 } __attribute__((aligned(8)));
 
+enum ckpt_route_types {
+       CKPT_ROUTE_IPV4, /* record uses the inet4_* fields of struct ckpt_route */
+       CKPT_ROUTE_IPV6, /* record uses the inet6_* fields of struct ckpt_route */
+       CKPT_ROUTE_MAX
+};
+
+#define CKPT_ROUTE_FLAG_GW 1 /* set in ckpt_route.flags when the gateway field was filled in */
+
+struct ckpt_route {
+       __u16 type;                                /* enum ckpt_route_types */
+       __u16 flags;                               /* CKPT_ROUTE_FLAG_*     */
+
+       union {
+               struct {
+                       __u32  inet4_len;          /* mask length (bits); a count, not an address */
+                       __u32  inet4_met;          /* metric             */
+                       __be32 inet4_dst;          /* route address      */
+                       __be32 inet4_gwy;          /* gateway address    */
+               };
+               struct {
+                       __u32 inet6_len;           /* mask length (bits) */
+                       __u32 inet6_met;           /* metric             */
+                       struct in6_addr inet6_dst; /* route address      */
+                       struct in6_addr inet6_gwy; /* gateway address    */
+               };
+       } __attribute__((aligned(8)));
+       char dev[IFNAMSIZ+1];                      /* output device name; empty string when none */
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_eventpoll_items {
        struct ckpt_hdr h;
        __s32  epfile_objref;
diff --git a/net/checkpoint_dev.c b/net/checkpoint_dev.c
index df8b16a..b34d1f2 100644
--- a/net/checkpoint_dev.c
+++ b/net/checkpoint_dev.c
@@ -17,9 +17,11 @@
 #include <linux/checkpoint_hdr.h>
 #include <linux/deferqueue.h>
 #include <linux/module.h>
+#include <linux/fib_rules.h>
 
 #include <net/net_namespace.h>
 #include <net/sch_generic.h>
+#include <net/ipv6.h>
 
 struct veth_newlink {
        char *peer;
@@ -107,6 +109,22 @@ static int __kern_dev_ioctl(struct net *net, unsigned int cmd, void *arg)
        return ret;
 }
 
+static void debug_route(struct ckpt_route *route)
+{ /* Log one route record through ckpt_debug(), formatted according to route->type */
+       if (route->type == CKPT_ROUTE_IPV4)
+               ckpt_debug("inet4 route %pI4/%i gw %pI4 metric %i dev %s\n",
+                          &route->inet4_dst, route->inet4_len,
+                          &route->inet4_gwy, route->inet4_met,
+                          route->dev);
+       else if (route->type == CKPT_ROUTE_IPV6)
+               ckpt_debug("inet6 route %pI6/%i gw %pI6 metric %i dev %s\n",
+                          &route->inet6_dst, route->inet6_len,
+                          &route->inet6_gwy, route->inet6_met,
+                          route->dev);
+       else /* neither CKPT_ROUTE_IPV4 nor CKPT_ROUTE_IPV6 */
+               ckpt_debug("unknown route type %i\n", route->type);
+}
+
 static struct socket *rtnl_open(struct net *net)
 {
        struct socket *sock;
@@ -313,11 +331,236 @@ int checkpoint_netdev(struct ckpt_ctx *ctx, void *ptr)
        return ret;
 }
 
+static int rtnl_dump_routes(struct socket *rtnl, int family)
+{ /* Send an RTM_GETROUTE request for @family on @rtnl; returns the kernel_sendmsg() result, -EIO on a short send, or -ENOMEM */
+       struct sk_buff *skb;
+       struct rtmsg *rtm;
+       int flags = NLM_F_ROOT | NLM_F_REQUEST;
+       struct msghdr msg;
+       struct kvec kvec;
+       struct nlmsghdr *nlh;
+       int ret = -ENOMEM; /* reported if nlmsg_put() below fails */
+
+       skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       nlh = nlmsg_put(skb, 0, 0, RTM_GETROUTE, sizeof(*rtm), flags);
+       if (!nlh)
+               goto out;
+
+       rtm = nlmsg_data(nlh);
+       memset(rtm, 0, sizeof(*rtm)); /* only the family is set in the request */
+       rtm->rtm_family = family;
+
+       nlmsg_end(skb, nlh);
+
+       memset(&msg, 0, sizeof(msg));
+       kvec.iov_len = skb->len;
+       kvec.iov_base = skb->head; /* NOTE(review): skb->data is the usual payload pointer; presumably equal to head here - confirm */
+
+       ret = kernel_sendmsg(rtnl, &msg, &kvec, 1, kvec.iov_len);
+       if ((ret >= 0) && (ret != skb->len))
+               ret = -EIO; /* the whole request must go out in one send */
+ out:
+       kfree_skb(skb); /* the request skb is freed on success and on failure */
+       return ret;
+}
+
+static int rtnl_process_inet4_route(struct net *net,
+                                   struct rtmsg *rtm,
+                                   struct nlattr **tb,
+                                   struct ckpt_route *route)
+{ /* Fill @route from one parsed IPv4 route message; returns 1 to keep the record, 0 to skip it */
+       if (rtm->rtm_type != RTN_UNICAST)
+               return 0; /* skip non-unicast routes */
+
+       route->type = CKPT_ROUTE_IPV4;
+       route->inet4_len = rtm->rtm_dst_len;
+
+       if (tb[RTA_DST]) /* keep the attribute's byte order: restore and %pI4 use the value unchanged */
+               route->inet4_dst = nla_get_be32(tb[RTA_DST]);
+       if (tb[RTA_GATEWAY]) {
+               route->flags |= CKPT_ROUTE_FLAG_GW; /* tells restore to emit RTA_GATEWAY */
+               route->inet4_gwy = nla_get_be32(tb[RTA_GATEWAY]);
+       }
+       if (tb[RTA_PRIORITY])
+               route->inet4_met = nla_get_u32(tb[RTA_PRIORITY]);
+
+       if (tb[RTA_OIF]) {
+               struct net_device *dev;
+
+               dev = dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+               if (dev) { /* store the name rather than the index; restore looks the device up by name */
+                       strncpy(route->dev, dev->name, IFNAMSIZ);
+                       dev_put(dev);
+               }
+       }
+
+       debug_route(route);
+
+       return 1; /* save this route */
+}
+
+static int rtnl_process_inet6_route(struct net *net,
+                                   struct rtmsg *rtm,
+                                   struct nlattr **tb,
+                                   struct ckpt_route *route)
+{ /* Fill @route from one parsed IPv6 route message; returns 1 to keep the record, 0 to skip it */
+       if (rtm->rtm_type != RTN_UNICAST)
+               return 0; /* skip non-unicast routes */
+
+       route->type = CKPT_ROUTE_IPV6;
+       route->inet6_len = rtm->rtm_dst_len;
+
+       if (tb[RTA_DST])
+               ipv6_addr_copy(&route->inet6_dst, nla_data(tb[RTA_DST]));
+       if (tb[RTA_GATEWAY]) {
+               route->flags |= CKPT_ROUTE_FLAG_GW; /* tells restore to emit RTA_GATEWAY */
+               ipv6_addr_copy(&route->inet6_gwy, nla_data(tb[RTA_GATEWAY]));
+       }
+       if (tb[RTA_PRIORITY])
+               route->inet6_met = nla_get_u32(tb[RTA_PRIORITY]);
+
+       if (tb[RTA_OIF]) {
+               struct net_device *dev;
+
+               dev = dev_get_by_index(net, nla_get_u32(tb[RTA_OIF]));
+               if (dev) { /* store the name rather than the index; restore looks the device up by name */
+                       strncpy(route->dev, dev->name, IFNAMSIZ);
+                       dev_put(dev);
+               }
+       }
+
+       debug_route(route);
+
+       return 1; /* save this route */
+}
+
+static int rtnl_process_routes(struct net *net,
+                              struct nlmsghdr *nlh, int len,
+                              struct ckpt_route *routes,
+                              int idx, int max)
+{ /* Convert the netlink messages starting at @nlh into routes[idx..]; returns the new idx or a negative errno */
+       struct nlmsghdr *i;
+
+       for (i = nlh; NLMSG_OK(i, len); i = NLMSG_NEXT(i, len)) {
+               struct ckpt_route *route = &routes[idx];
+               struct rtmsg *rtm = NLMSG_DATA(i);
+               struct nlattr *tb[RTA_MAX+1]; /* indexed by RTA_* in the per-family helpers */
+               int ret;
+
+               if (idx >= max)
+                       return -E2BIG; /* checkpoint_netns_routes() retries with a larger array */
+
+               if (i->nlmsg_type == NLMSG_DONE)
+                       break;
+               else if (i->nlmsg_type != RTM_NEWROUTE) { /* test the current message, not the first one */
+                       struct nlmsgerr *errmsg = nlmsg_data(i);
+                       return errmsg->error;
+               }
+
+               ret = nlmsg_parse(i, sizeof(*rtm), tb, RTA_MAX, NULL);
+               if (ret < 0)
+                       return ret;
+
+               memset(route, 0, sizeof(*route)); /* helpers only set the fields they find */
+
+               if (rtm->rtm_family == AF_INET)
+                       ret = rtnl_process_inet4_route(net, rtm, tb, route);
+               else if (rtm->rtm_family == AF_INET6)
+                       ret = rtnl_process_inet6_route(net, rtm, tb, route);
+               else
+                       ret = 0; /* skip */
+               if (ret < 0)
+                       return ret;
+               else if (ret) /* helper asked to keep this record */
+                       idx += 1;
+       }
+
+       return idx;
+}
+
+static int rtnl_get_routes(struct net *net, int family,
+                          struct ckpt_route *routes, int idx, int max)
+{ /* Request a @family route dump in @net and append the result to routes[idx..]; returns the new idx or a negative errno */
+       int ret;
+       long timeo = MAX_SCHEDULE_TIMEOUT;
+       struct nlmsghdr *nlh;
+       struct sk_buff *skb = NULL;
+       struct socket *rtnl = NULL;
+
+       rtnl = rtnl_open(net);
+       if (IS_ERR(rtnl))
+               return PTR_ERR(rtnl);
+
+       ret = rtnl_dump_routes(rtnl, family);
+       if (ret < 0)
+               goto out;
+
+       lock_sock(rtnl->sk);
+       ret = sk_wait_data(rtnl->sk, &timeo);
+       if (ret) /* NOTE(review): only one skb is dequeued; a reply spanning several skbs would presumably be truncated - confirm */
+               skb = skb_dequeue(&rtnl->sk->sk_receive_queue);
+       release_sock(rtnl->sk);
+       if (!skb) {
+               ret = -EIO;
+               goto out;
+       }
+
+       nlh = nlmsg_hdr(skb);
+       if (!nlh) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = rtnl_process_routes(net, nlh, skb->len, routes, idx, max);
+ out:
+       rtnl_close(rtnl); /* the socket lives only for this one request */
+       kfree_skb(skb); /* skb is still NULL if nothing was dequeued */
+       return ret;
+}
+
+int checkpoint_netns_routes(struct ckpt_ctx *ctx, struct net *net,
+                           struct ckpt_route **_routes)
+{ /* Collect the IPv4 and IPv6 routes of @net into a kmalloc'd array stored in *_routes; returns the record count or a negative errno */
+       struct ckpt_route *routes = NULL;
+       int max = 32; /* initial capacity; doubled each time a pass reports -E2BIG */
+       int idx;
+       int families[] = {AF_INET, AF_INET6, 0}; /* 0 terminates the list */
+       int family;
+ retry:
+       idx = 0;
+       kfree(routes); /* drops the too-small array of the previous pass; NULL on the first pass */
+       routes = kmalloc(max * sizeof(*routes), GFP_KERNEL);
+       if (!routes)
+               return -ENOMEM;
+
+       for (family = 0; families[family]; family++) {
+               idx = rtnl_get_routes(net, families[family], routes, idx, max);
+               if (idx == -E2BIG) {
+                       max *= 2;
+                       goto retry; /* start over from the first family with the bigger array */
+               } else if (idx < 0)
+                       break;
+       }
+
+       if (idx < 0) {
+               kfree(routes);
+               routes = NULL;
+               ckpt_err(ctx, idx, "error saving routes\n");
+       }
+       *_routes = routes; /* NULL on error; otherwise checkpoint_netns() kfree()s it */
+
+       return idx;
+}
+
 int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
 {
        struct net *net = ptr;
        struct net_device *dev;
        struct ckpt_hdr_netns *h;
+       struct ckpt_route *routes = NULL;
        int ret;
 
        h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_NET_NS);
@@ -327,10 +570,19 @@ int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
        h->this_ref = ckpt_obj_lookup(ctx, net, CKPT_OBJ_NET_NS);
        BUG_ON(h->this_ref <= 0);
 
+       ret = checkpoint_netns_routes(ctx, net, &routes);
+       if (ret < 0)
+               goto out;
+       h->routes = ret;
+
        ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
        if (ret < 0)
                goto out;
 
+       ret = ckpt_write_buffer(ctx, routes, h->routes * sizeof(*routes));
+       if (ret < 0)
+               goto out;
+
        for_each_netdev(net, dev) {
                if (dev->netdev_ops->ndo_checkpoint)
                        ret = checkpoint_obj(ctx, dev, CKPT_OBJ_NETDEV);
@@ -347,6 +599,7 @@ int checkpoint_netns(struct ckpt_ctx *ctx, void *ptr)
        }
  out:
        ckpt_hdr_put(ctx, h);
+       kfree(routes);
 
        return ret;
 }
@@ -862,10 +1115,145 @@ void *restore_netdev(struct ckpt_ctx *ctx)
        return dev;
 }
 
+static int rtnl_restore_route(struct net *net, struct ckpt_route *route)
+{ /* Build an RTM_NEWROUTE request from @route and submit it with rtnl_do(); returns its result, 0 for a skipped route, or a negative errno */
+       struct sk_buff *skb;
+       struct rtmsg *rtm;
+       int flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+       struct nlmsghdr *nlh;
+       int ret = 0;
+
+       skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+       if (!skb)
+               return -ENOMEM;
+
+       nlh = nlmsg_put(skb, 0, 0, RTM_NEWROUTE, sizeof(*rtm), flags);
+       if (!nlh) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       rtm = nlmsg_data(nlh);
+       memset(rtm, 0, sizeof(*rtm));
+
+       rtm->rtm_table = RT_TABLE_MAIN; /* table, protocol and scope are not saved in the image; fixed values are used */
+       rtm->rtm_protocol = RTPROT_BOOT;
+       rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+       rtm->rtm_type = RTN_UNICAST;
+
+       if (route->dev[0]) { /* an empty name means no output device was recorded */
+               struct net_device *dev;
+
+               dev = dev_get_by_name(net, route->dev);
+               if (!dev) {
+                       ckpt_debug("unable to find dev %s for route\n",
+                                  route->dev);
+                       ret = -EINVAL;
+                       goto out;
+               }
+               nla_put_u32(skb, RTA_OIF, dev->ifindex); /* NOTE(review): nla_put*() results are not checked anywhere in this function */
+               dev_put(dev);
+       }
+
+       if (route->type == CKPT_ROUTE_IPV4) {
+               rtm->rtm_family = AF_INET;
+               rtm->rtm_dst_len = route->inet4_len;
+
+               nla_put_u32(skb, RTA_DST, route->inet4_dst); /* the stored __be32 value is emitted unchanged */
+               if (route->flags & CKPT_ROUTE_FLAG_GW)
+                       nla_put_u32(skb, RTA_GATEWAY, route->inet4_gwy);
+               nla_put_u32(skb, RTA_PRIORITY, route->inet4_met);
+       } else if (route->type == CKPT_ROUTE_IPV6) {
+               int len = sizeof(route->inet6_dst);
+
+               if (ipv6_addr_scope(&route->inet6_dst))
+                       goto out; /* Skip non-global scope routes; ret is still 0 here */
+
+               rtm->rtm_family = AF_INET6;
+               rtm->rtm_dst_len = route->inet6_len;
+
+               nla_put(skb, RTA_DST, len, &route->inet6_dst);
+               if (route->flags & CKPT_ROUTE_FLAG_GW)
+                       nla_put(skb, RTA_GATEWAY, len, &route->inet6_gwy);
+               nla_put_u32(skb, RTA_PRIORITY, route->inet6_met);
+       } else {
+               ckpt_debug("unsupported route type %i\n", route->type);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       nlmsg_end(skb, nlh);
+
+       debug_route(route);
+
+       ret = rtnl_do(net, skb);
+ out:
+       kfree_skb(skb); /* freed on every path, including after rtnl_do() */
+       return ret;
+}
+
+static int restore_routes(struct net *net, struct ckpt_route *routes, int count)
+{ /* Re-create @count routes in @net; returns the last rtnl_restore_route() result with -EEXIST mapped to 0 */
+       int i;
+       int ret = 0;
+
+       for (i = 0; i < count; i++) {
+               struct ckpt_route *route = &routes[i];
+
+               ret = rtnl_restore_route(net, route);
+               if (ret == -EEXIST)
+                       /* Some routes have been implied by device addresses */
+                       ret = 0; /* not a failure, even when this is the last route */
+               else if (ret < 0)
+                       break;
+       }
+
+       return ret;
+}
+
+struct dq_routes { /* argument block handed to deferred_restore_routes() */
+       struct ckpt_ctx *ctx;      /* used for error reporting */
+       struct net *net;           /* namespace the routes are restored into */
+       struct ckpt_route *routes; /* array of @count records; kfree()d by deferred_restore_routes() */
+       int count;
+};
+
+static int deferred_restore_routes(void *data)
+{ /* Deferqueue callback: restore the routes described by the struct dq_routes in @data, then free the array */
+       struct dq_routes *dq = data;
+       int ret;
+
+       ret = restore_routes(dq->net, dq->routes, dq->count);
+       if (ret < 0)
+               ckpt_err(dq->ctx, ret, "failed to restore routes\n");
+
+       kfree(dq->routes); /* freed whether or not the restore succeeded */
+
+       return ret;
+}
+
+static int defer_restore_routes(struct ckpt_ctx *ctx,
+                               struct net *net,
+                               struct ckpt_route *routes,
+                               int count)
+{ /* Queue deferred_restore_routes() on ctx->files_deferq for @routes; returns the deferqueue_add() result */
+       struct dq_routes dq; /* NOTE(review): on-stack; presumably deferqueue_add() copies sizeof(dq) bytes - confirm */
+
+       dq.ctx = ctx;
+       dq.net = net;
+       dq.routes = routes;
+       dq.count = count;
+
+       return deferqueue_add(ctx->files_deferq, &dq, sizeof(dq),
+                             deferred_restore_routes, NULL);
+}
+
 void *restore_netns(struct ckpt_ctx *ctx)
 {
        struct ckpt_hdr_netns *h;
        struct net *net;
+       struct ckpt_route *routes = NULL;
+       int ret;
 
        h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_NET_NS);
        if (IS_ERR(h)) {
@@ -873,12 +1261,34 @@ void *restore_netns(struct ckpt_ctx *ctx)
                return h;
        }
 
+       ret = ckpt_read_payload(ctx, (void **)&routes,
+                               h->routes * sizeof(*routes), CKPT_HDR_BUFFER);
+       if (ret < 0) {
+               ckpt_err(ctx, ret, "Unable to read routes buffer\n");
+               net = ERR_PTR(ret);
+               goto out;
+       }
+
        if (h->this_ref != 0) {
                net = copy_net_ns(CLONE_NEWNET, current->nsproxy->net_ns);
                if (IS_ERR(net))
                        goto out;
-       } else
+
+               ret = defer_restore_routes(ctx, net, routes, h->routes);
+               if (ret < 0) {
+                       kfree(routes);
+                       put_net(net);
+                       net = ERR_PTR(ret);
+               }
+       } else {
+               if (h->routes) {
+                       net = ERR_PTR(-EINVAL);
+                       ckpt_err(ctx, -EINVAL,
+                                "Parent netns claims to have routes\n");
+                       goto out;
+               }
                net = current->nsproxy->net_ns;
+       }
  out:
        ckpt_hdr_put(ctx, h);
 
-- 
1.6.2.5

_______________________________________________
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel

Reply via email to