This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
has been tested with single and multiple processes, and with data in flight
at the time of checkpoint.  It supports both socketpair()s and path-based
sockets.

I have an almost-working AF_INET follow-on to this which I can submit after
this is reviewed and tweaked into acceptance.

Signed-off-by: Dan Smith <[email protected]>
---
 checkpoint/files.c             |    7 +
 checkpoint/objhash.c           |   27 +++
 include/linux/checkpoint_hdr.h |   71 ++++++++
 include/net/sock.h             |    8 +
 net/Makefile                   |    2 +
 net/socket.c                   |   58 ++++++
 net/socket_cr.c                |  378 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 551 insertions(+), 0 deletions(-)
 create mode 100644 net/socket_cr.c

diff --git a/checkpoint/files.c b/checkpoint/files.c
index b264e40..bb2cca0 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,7 @@
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 
 /**************************************************************************
@@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
                .file_type = CKPT_FILE_PIPE,
                .restore = pipe_file_restore,
        },
+       /* socket */
+       {
+               .file_name = "SOCKET",
+               .file_type = CKPT_FILE_SOCKET,
+               .restore = sock_file_restore,
+       },
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
index 045a920..7819e5e 100644
--- a/checkpoint/objhash.c
+++ b/checkpoint/objhash.c
@@ -19,6 +19,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <net/sock.h>
 
 struct ckpt_obj;
 struct ckpt_obj_ops;
@@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
        return atomic_read(&((struct ipc_namespace *) ptr)->count);
 }
 
+/* Object-hash ref_grab hook: take a reference on the struct sock. */
+static int obj_sock_grab(void *ptr)
+{
+       struct sock *sk = ptr;
+
+       sock_hold(sk);
+       return 0;
+}
+
+/* Object-hash ref_drop hook: release a reference on the struct sock. */
+static void obj_sock_drop(void *ptr)
+{
+       struct sock *sk = ptr;
+
+       sock_put(sk);
+}
+
+/* Object-hash ref_users hook: report the current sk_refcnt of the sock. */
+static int obj_sock_users(void *ptr)
+{
+       struct sock *sk = ptr;
+
+       return atomic_read(&sk->sk_refcnt);
+}
+
 static struct ckpt_obj_ops ckpt_obj_ops[] = {
        /* ignored object */
        {
@@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
                .checkpoint = checkpoint_bad,
                .restore = restore_bad,
        },
+       /* sock object */
+       {
+               .obj_name = "SOCKET",
+               .obj_type = CKPT_OBJ_SOCK,
+               .ref_drop = obj_sock_drop,
+               .ref_grab = obj_sock_grab,
+               .ref_users = obj_sock_users,
+               .checkpoint = sock_file_checkpoint,
+               .restore = sock_file_restore,
+       },
 };
 
 
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index cd427d8..252331a 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -12,6 +12,13 @@
 
 #include <linux/types.h>
 #include <linux/utsname.h>
+#include <linux/socket.h>
+
+/* In userspace, bring in the struct sockaddr_* definitions */
+#ifndef __KERNEL__
+#include <sys/socket.h>
+#include <sys/types.h>
+#endif
 
 /*
  * To maintain compatibility between 32-bit and 64-bit architecture flavors,
@@ -76,6 +83,11 @@ enum {
        CKPT_HDR_IPC_MSG_MSG,
        CKPT_HDR_IPC_SEM,
 
+       CKPT_HDR_FD_SOCKET = 601,
+       CKPT_HDR_SOCKET,
+       CKPT_HDR_SOCKET_BUFFERS,
+       CKPT_HDR_SOCKET_BUFFER,
+
        CKPT_HDR_TAIL = 9001,
 
        CKPT_HDR_ERROR = 9999,
@@ -103,6 +115,7 @@ enum obj_type {
        CKPT_OBJ_NS,
        CKPT_OBJ_UTS_NS,
        CKPT_OBJ_IPC_NS,
+       CKPT_OBJ_SOCK,
        CKPT_OBJ_MAX
 };
 
@@ -225,6 +238,7 @@ enum file_type {
        CKPT_FILE_IGNORE = 0,
        CKPT_FILE_GENERIC,
        CKPT_FILE_PIPE,
+       CKPT_FILE_SOCKET,
        CKPT_FILE_MAX
 };
 
@@ -248,6 +262,11 @@ struct ckpt_hdr_file_pipe {
        __s32 pipe_objref;
 } __attribute__((aligned(8)));
 
+/*
+ * File record written for a socket-backed file (f_type CKPT_FILE_SOCKET).
+ * The socket state itself follows in a separate CKPT_HDR_SOCKET record.
+ */
+struct ckpt_hdr_file_socket {
+       struct ckpt_hdr_file common;    /* generic per-file header */
+       __u16 family;                   /* presumably the AF_* family - TODO confirm */
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_file_pipe_state {
        struct ckpt_hdr h;
        __s32 pipe_len;
@@ -394,4 +413,56 @@ struct ckpt_hdr_ipc_sem {
 #define CKPT_TST_OVERFLOW_64(a, b) \
        ((sizeof(a) > sizeof(b)) && ((a) > LONG_MAX))
 
+/*
+ * Socket state record (CKPT_HDR_SOCKET).  Filled in at checkpoint time by
+ * __sock_file_checkpoint() and sock_cptrst(), and consumed at restart by
+ * __sock_file_restore().
+ */
+struct ckpt_hdr_socket {
+       struct ckpt_hdr h;
+
+       /* fields mirrored from struct sock_common */
+       __u16 family;
+       __u8 state;
+       __u8 reuse;
+       __u32 bound_dev_if;
+
+       /* fields mirrored from struct sock */
+       __u8 protocol;
+       __u16 type;
+       __u8 sock_state;
+       __u8 shutdown;
+       __u8 userlocks;
+       __u8 no_check;
+       __u32 err;
+       __u32 err_soft;
+       __u32 priority;
+       __u64 rcvlowat;
+       __u64 rcvtimeo;
+       __u64 sndtimeo;
+       __u16 backlog;
+       __s32 rcvbuf;
+       __s32 sndbuf;
+       __u64 flags;
+       __u64 lingertime;
+
+       /* fields mirrored from struct socket */
+       __u64 socket_flags;
+       __u8 socket_state;
+
+       /*
+        * Local and remote addresses as returned by ->getname(); common to
+        * all supported families.
+        * NOTE(review): struct sockaddr may be too small to hold an AF_UNIX
+        * path address (struct sockaddr_un) - confirm the size is adequate.
+        */
+       struct sockaddr laddr;
+       struct sockaddr raddr;
+       __u32 laddr_len;
+       __u32 raddr_len;
+
+       /* per-family data */
+       union {
+               /* AF_UNIX: object references of this sock and of its peer */
+               struct {
+                       __u32 this;
+                       __u32 peer;     /* 0 when the socket has no peer */
+               } un;
+       };
+
+} __attribute__ ((aligned(8)));
+
+/*
+ * Queue header record (CKPT_HDR_SOCKET_BUFFERS): announces how many
+ * CKPT_HDR_SOCKET_BUFFER payload records follow for one socket queue.
+ */
+struct ckpt_hdr_socket_buffer {
+       struct ckpt_hdr h;
+       __u32 skb_count;        /* number of buffer records that follow */
+} __attribute__ ((aligned(8)));
+
 #endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..ced8cd9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1482,4 +1482,12 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+/* Checkpoint/Restart Functions */
+struct ckpt_ctx;
+struct ckpt_hdr_socket;
+/* Write the file record and the socket state of a socket-backed file. */
+extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
+/* Create a socket from an already-read CKPT_HDR_SOCKET record. */
+extern struct socket *__sock_file_restore(struct ckpt_ctx *,
+                                         struct ckpt_hdr_socket *);
+/* Read a CKPT_HDR_SOCKET record; returns a struct file * or an ERR_PTR(). */
+extern void *sock_file_restore(struct ckpt_ctx *);
+
 #endif /* _SOCK_H */
diff --git a/net/Makefile b/net/Makefile
index 9e00a55..1c68a4e 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_SYSCTL)           += sysctl_net.o
 endif
 obj-$(CONFIG_WIMAX)            += wimax/
+
+obj-$(CONFIG_CHECKPOINT)       += socket_cr.o
diff --git a/net/socket.c b/net/socket.c
index 791d71a..d1a187d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -96,6 +96,9 @@
 #include <net/sock.h>
 #include <linux/netfilter.h>
 
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
                         unsigned long nr_segs, loff_t pos);
@@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
        .sendpage =     sock_sendpage,
        .splice_write = generic_splice_sendpage,
        .splice_read =  sock_splice_read,
+#ifdef CONFIG_CHECKPOINT
+       .checkpoint =   sock_file_checkpoint,
+#endif
 };
 
 /*
@@ -415,6 +421,58 @@ int sock_map_fd(struct socket *sock, int flags)
        return fd;
 }
 
+static struct file *sock_alloc_attach_fd(struct socket *socket)
+{
+       struct file *file;
+       int err;
+
+       file = get_empty_filp();
+       if (!file)
+               return ERR_PTR(ENOMEM);
+
+       err = sock_attach_fd(socket, file, 0);
+       if (err < 0) {
+               put_filp(file);
+               file = ERR_PTR(err);
+       }
+
+       return file;
+}
+
+void *sock_file_restore(struct ckpt_ctx *ctx)
+{
+       struct ckpt_hdr_socket *h = NULL;
+       struct socket *socket = NULL;
+       struct file *file = NULL;
+       int err;
+
+       h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+       if (IS_ERR(h))
+               return h;
+
+       socket = __sock_file_restore(ctx, h);
+       if (IS_ERR(socket)) {
+               err = PTR_ERR(socket);
+               goto err_put;
+       }
+
+       file = sock_alloc_attach_fd(socket);
+       if (IS_ERR(file)) {
+               err = PTR_ERR(file);
+               goto err_release;
+       }
+
+       ckpt_hdr_put(ctx, h);
+
+       return file;
+ err_release:
+       sock_release(socket);
+ err_put:
+       ckpt_hdr_put(ctx, h);
+
+       return ERR_PTR(err);
+}
+
 static struct socket *sock_from_file(struct file *file, int *err)
 {
        if (file->f_op == &socket_file_ops)
diff --git a/net/socket_cr.c b/net/socket_cr.c
new file mode 100644
index 0000000..76759fe
--- /dev/null
+++ b/net/socket_cr.c
@@ -0,0 +1,378 @@
+/*
+ *  Copyright 2009 IBM Corporation
+ *
+ *  Author: Dan Smith <[email protected]>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/socket.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+
+#include <net/af_unix.h>
+#include <net/tcp_states.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * Snapshot every skb on @from by appending a private copy to @to.
+ *
+ * The source queue lock is held for the whole walk, so the copies must be
+ * allocated with GFP_ATOMIC: sleeping is not allowed under a spinlock.
+ *
+ * Returns the number of buffers copied, or -ENOMEM.  On failure the copies
+ * made so far remain on @to and must be purged by the caller.
+ */
+static int sock_copy_buffers(struct sk_buff_head *from,
+                            struct sk_buff_head *to)
+{
+       int count = 0;
+       struct sk_buff *skb;
+
+       spin_lock(&from->lock);
+
+       skb_queue_walk(from, skb) {
+               struct sk_buff *tmp;
+
+               tmp = skb_copy(skb, GFP_ATOMIC);
+               if (!tmp) {
+                       count = -ENOMEM;
+                       goto out;
+               }
+               skb_queue_tail(to, tmp);
+               count++;
+       }
+ out:
+       spin_unlock(&from->lock);
+
+       return count;
+}
+
+/*
+ * Emit one CKPT_HDR_SOCKET_BUFFER record per skb on @queue, carrying the
+ * skb payload.  Stops at the first write error and returns it; returns 0
+ * once every buffer has been written.
+ */
+static int __sock_write_buffers(struct ckpt_ctx *ctx,
+                               struct sk_buff_head *queue)
+{
+       struct sk_buff *cur;
+       int err = 0;
+
+       skb_queue_walk(queue, cur) {
+               err = ckpt_write_obj_type(ctx, cur->data, cur->len,
+                                         CKPT_HDR_SOCKET_BUFFER);
+               if (err != 0)
+                       break;
+       }
+
+       return err;
+}
+
+/*
+ * Write the contents of @queue to the checkpoint image.
+ *
+ * The queue is first copied to a private list, then a
+ * CKPT_HDR_SOCKET_BUFFERS record with the buffer count is written,
+ * followed by one record per buffer.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+{
+       struct ckpt_hdr_socket_buffer *h;
+       struct sk_buff_head tmpq;
+       int count;
+       int ret;
+
+       h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+       if (!h)
+               return -ENOMEM; /* nothing to put or purge yet */
+
+       skb_queue_head_init(&tmpq);
+
+       /*
+        * Keep the result in a signed local: h->skb_count is unsigned, so
+        * an error return stored there could never be detected.
+        */
+       count = sock_copy_buffers(queue, &tmpq);
+       if (count < 0) {
+               ret = count;
+               goto out;
+       }
+       h->skb_count = count;
+
+       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+       if (!ret)
+               ret = __sock_write_buffers(ctx, &tmpq);
+
+ out:
+       ckpt_hdr_put(ctx, h);
+       __skb_queue_purge(&tmpq);
+
+       return ret;
+}
+
+/*
+ * AF_UNIX part of a socket checkpoint: record the object references of
+ * this sock and of its peer (0 when there is no peer) in @h, then write
+ * the completed header.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int sock_un_checkpoint(struct ckpt_ctx *ctx,
+                             struct sock *sock,
+                             struct ckpt_hdr_socket *h)
+{
+       struct unix_sock *sk = unix_sk(sock);
+       int objref;
+       int new;
+
+       /*
+        * The header fields are unsigned, so check the lookup result in a
+        * signed local before storing it.
+        */
+       objref = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
+       if (objref < 0)
+               return objref;
+       h->un.this = objref;
+
+       h->un.peer = 0;
+       if (sk->peer) {
+               struct unix_sock *pr = unix_sk(sk->peer);
+
+               objref = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
+               if (objref < 0)
+                       return objref;
+               h->un.peer = objref;
+       }
+
+       return ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+}
+
+/*
+ * Transfer the generic socket settings between @sock and the header @h
+ * using CKPT_COPY(); @op is handed to the macro and presumably selects the
+ * copy direction (the checkpoint path passes CKPT_CPT).  Always returns 0.
+ */
+static int sock_cptrst(struct ckpt_ctx *ctx,
+                      struct sock *sock,
+                      struct ckpt_hdr_socket *h,
+                      int op)
+{
+       struct socket *socket = sock->sk_socket;
+
+       /* sock_common fields */
+       CKPT_COPY(op, h->reuse, sock->sk_reuse);
+       CKPT_COPY(op, h->bound_dev_if, sock->sk_bound_dev_if);
+
+       /* sock fields */
+       CKPT_COPY(op, h->protocol, sock->sk_protocol);
+       CKPT_COPY(op, h->shutdown, sock->sk_shutdown);
+       CKPT_COPY(op, h->userlocks, sock->sk_userlocks);
+       CKPT_COPY(op, h->no_check, sock->sk_no_check);
+       CKPT_COPY(op, h->err, sock->sk_err);
+       CKPT_COPY(op, h->err_soft, sock->sk_err_soft);
+       CKPT_COPY(op, h->priority, sock->sk_priority);
+       CKPT_COPY(op, h->rcvlowat, sock->sk_rcvlowat);
+       CKPT_COPY(op, h->rcvtimeo, sock->sk_rcvtimeo);
+       CKPT_COPY(op, h->sndtimeo, sock->sk_sndtimeo);
+       CKPT_COPY(op, h->backlog, sock->sk_max_ack_backlog);
+       CKPT_COPY(op, h->rcvbuf, sock->sk_rcvbuf);
+       CKPT_COPY(op, h->sndbuf, sock->sk_sndbuf);
+       CKPT_COPY(op, h->flags, sock->sk_flags);
+       CKPT_COPY(op, h->lingertime, sock->sk_lingertime);
+
+       /* socket fields, only when a struct socket is attached */
+       if (socket != NULL) {
+               CKPT_COPY(op, h->socket_flags, socket->flags);
+               CKPT_COPY(op, h->socket_state, socket->state);
+       }
+
+       return 0;
+}
+
+/*
+ * Write the socket state behind @file to the checkpoint image.
+ *
+ * Fills a CKPT_HDR_SOCKET header with the identity, addresses and generic
+ * settings of the socket, hands it to the family-specific writer (only
+ * AF_UNIX is supported), then dumps the receive and write queues.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int __sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+       struct socket *socket = file->private_data;
+       struct sock *sock = socket->sk;
+       struct ckpt_hdr_socket *h;
+       int addr_len;
+       int ret = 0;
+
+       h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
+       if (!h)
+               return -ENOMEM;
+
+       h->family = sock->sk_family;
+       h->state = socket->state;
+       h->sock_state = sock->sk_state;
+       h->reuse = sock->sk_reuse;
+       h->type = sock->sk_type;
+       h->protocol = sock->sk_protocol;
+
+       /*
+        * ->getname() takes an int length, so go through a local instead of
+        * pointing it at the __u32 header fields.
+        * NOTE(review): for AF_UNIX path sockets the returned address may
+        * be larger than struct sockaddr - confirm h->laddr/h->raddr are
+        * big enough.
+        */
+       addr_len = sizeof(h->laddr);
+       if (socket->ops->getname(socket, &h->laddr, &addr_len, 0)) {
+               ret = -EINVAL;
+               goto out;
+       }
+       h->laddr_len = addr_len;
+
+       /* Listening and datagram sockets are not asked for a peer address */
+       h->raddr_len = sizeof(h->raddr);
+       if ((h->sock_state != TCP_LISTEN) && (h->type != SOCK_DGRAM)) {
+               addr_len = sizeof(h->raddr);
+               if (socket->ops->getname(socket, &h->raddr, &addr_len, 1)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               h->raddr_len = addr_len;
+       }
+
+       sock_cptrst(ctx, sock, h, CKPT_CPT);
+
+       if (h->family == AF_UNIX) {
+               ret = sock_un_checkpoint(ctx, sock, h);
+               if (ret)
+                       goto out;
+       } else {
+               ckpt_debug("unsupported socket type %i\n", h->family);
+               ret = -EINVAL;  /* errno must be negative */
+               goto out;
+       }
+
+       ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
+       if (ret)
+               goto out;
+
+       ret = sock_write_buffers(ctx, &sock->sk_write_queue);
+       if (ret)
+               goto out;
+
+       /* FIXME: write out-of-order queue for TCP */
+ out:
+       ckpt_hdr_put(ctx, h);
+
+       return ret;
+}
+
+/*
+ * Read one CKPT_HDR_SOCKET_BUFFER record and turn its payload into a newly
+ * allocated skb charged to @sock.
+ *
+ * On success returns 0 and stores the skb in *@skb.  On failure returns a
+ * negative errno; *@skb is NULL if the allocation itself failed.
+ */
+static int sock_read_buffer(struct ckpt_ctx *ctx,
+                           struct sock *sock,
+                           struct sk_buff **skb)
+{
+       struct ckpt_hdr *h;
+       int ret = 0;
+       int len;
+
+       h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
+       if (IS_ERR(h))
+               return PTR_ERR(h);
+
+       /* The image is external input: never let the payload go negative */
+       if (h->len < sizeof(*h)) {
+               ret = -EINVAL;
+               goto out;
+       }
+       len = h->len - sizeof(*h);
+
+       *skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
+       if (*skb == NULL) {
+               /* Keep the reported error; fall back to -ENOMEM if unset */
+               if (ret >= 0)
+                       ret = -ENOMEM;
+               goto out;
+       }
+
+       /* The payload follows the record header directly */
+       memcpy(skb_put(*skb, len), (char *)(h + 1), len);
+ out:
+       ckpt_hdr_put(ctx, h);
+       return ret;
+}
+
+/*
+ * Refill @queue from the checkpoint image: read the
+ * CKPT_HDR_SOCKET_BUFFERS record, then read that many buffer records and
+ * append each resulting skb to @queue.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int sock_read_buffers(struct ckpt_ctx *ctx,
+                            struct sock *sock,
+                            struct sk_buff_head *queue)
+{
+       struct ckpt_hdr_socket_buffer *h;
+       int ret = 0;
+       int i;
+
+       h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
+       if (IS_ERR(h))
+               return PTR_ERR(h);      /* no header to put back */
+
+       for (i = 0; i < h->skb_count; i++) {
+               struct sk_buff *skb = NULL;
+
+               ret = sock_read_buffer(ctx, sock, &skb);
+               if (ret)
+                       break;
+
+               skb_queue_tail(queue, skb);
+       }
+
+       ckpt_hdr_put(ctx, h);
+
+       return ret;
+}
+
+/*
+ * AF_UNIX part of a socket restart.
+ *
+ * Established sockets are paired through the object hash: the first
+ * socket of a pair registers itself under its saved reference, and the
+ * second one finds it there and links both directions.  Listening sockets
+ * are re-bound to the saved local address and put back into listen state.
+ * Any other state is only reported through ckpt_debug().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int sock_un_restart(struct ckpt_ctx *ctx,
+                          struct ckpt_hdr_socket *h,
+                          struct socket *socket)
+{
+       struct sock *peer;
+       int ret = 0;
+
+       if (h->sock_state == TCP_ESTABLISHED) {
+               peer = ckpt_obj_fetch(ctx, h->un.peer, CKPT_OBJ_SOCK);
+               if (peer && !IS_ERR(peer)) {
+                       /* We're last, so join with peer */
+                       struct sock *this = socket->sk;
+
+                       /* each side holds a reference on the other */
+                       sock_hold(this);
+                       sock_hold(peer);
+
+                       unix_sk(this)->peer = peer;
+                       unix_sk(peer)->peer = this;
+
+                       this->sk_peercred.pid = task_tgid_vnr(current);
+                       current_euid_egid(&this->sk_peercred.uid,
+                                         &this->sk_peercred.gid);
+
+                       peer->sk_peercred.pid = task_tgid_vnr(current);
+                       current_euid_egid(&peer->sk_peercred.uid,
+                                         &peer->sk_peercred.gid);
+               } else {
+                       /* We're first, so add our socket and wait for peer */
+                       ret = ckpt_obj_insert(ctx, socket->sk, h->un.this,
+                                             CKPT_OBJ_SOCK);
+                       if (ret < 0)
+                               goto out;
+                       /* a non-negative result is not an error */
+                       ret = 0;
+               }
+
+       } else if (h->sock_state == TCP_LISTEN) {
+               ret = socket->ops->bind(socket,
+                                       (struct sockaddr *)&h->laddr,
+                                       h->laddr_len);
+               if (ret < 0)
+                       goto out;
+
+               ret = socket->ops->listen(socket, h->backlog);
+               if (ret < 0)
+                       goto out;
+       } else
+               /* report the field the branches above tested */
+               ckpt_debug("unsupported UNIX socket state %i\n",
+                          h->sock_state);
+
+       socket->state = h->state;
+       socket->sk->sk_state = h->sock_state;
+ out:
+       return ret;
+}
+
+/*
+ * Create a socket described by header @h and restore its state.
+ *
+ * A fresh socket of the saved family and type is created, the
+ * family-specific restart code runs (only AF_UNIX is supported), and the
+ * receive and write queues are refilled from the image.
+ *
+ * Returns the socket, or an ERR_PTR(); on failure the socket is released.
+ */
+struct socket *__sock_file_restore(struct ckpt_ctx *ctx,
+                                  struct ckpt_hdr_socket *h)
+{
+       struct socket *socket;
+       struct sock *sk;
+       int ret;
+
+       ret = sock_create(h->family, h->type, 0, &socket);
+       if (ret < 0)
+               return ERR_PTR(ret);
+
+       if (h->family != AF_UNIX) {
+               ckpt_debug("unsupported family %i\n", h->family);
+               ret = -EINVAL;
+               goto fail;
+       }
+
+       ret = sock_un_restart(ctx, h, socket);
+       ckpt_debug("sock_un_restart: %i\n", ret);
+       if (ret)
+               goto fail;
+
+       sk = socket->sk;
+
+       ret = sock_read_buffers(ctx, sk, &sk->sk_receive_queue);
+       if (ret)
+               goto fail;
+
+       ret = sock_read_buffers(ctx, sk, &sk->sk_write_queue);
+       if (ret)
+               goto fail;
+
+       return socket;
+
+ fail:
+       sock_release(socket);
+       return ERR_PTR(ret);
+}
+
+/*
+ * Checkpoint a socket-backed file.
+ *
+ * Writes a CKPT_HDR_FILE record of type CKPT_FILE_SOCKET for the file
+ * pointed to by @ptr, then the socket state via __sock_file_checkpoint().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
+{
+       struct ckpt_hdr_file_socket *h;
+       int ret;
+       struct file *file = ptr;
+       struct socket *socket = file->private_data;
+
+       h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+       if (!h)
+               return -ENOMEM;
+
+       h->common.f_type = CKPT_FILE_SOCKET;
+       /* Fill in every field so the record carries no stale data */
+       h->family = socket->sk->sk_family;
+
+       ret = checkpoint_file_common(ctx, file, &h->common);
+       if (ret < 0)
+               goto out;
+       ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+       if (ret < 0)
+               goto out;
+
+       ret = __sock_file_checkpoint(ctx, file);
+ out:
+       ckpt_hdr_put(ctx, h);
+       return ret;
+}
+
+
-- 
1.6.0.4

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to