This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures are saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  
https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping them.

Now that listening socket support is in the c/r tree, I think it is
a good time to start fielding comments and suggestions on the
connected part, as I think lots of folks have input on how to make it
better, safer, etc.

Cc: [email protected]
Cc: Oren Laadan <[email protected]>
Cc: John Dykstra <[email protected]>
Signed-off-by: Dan Smith <[email protected]>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint_hdr.h   |   97 +++++++++++++++++++
 include/linux/checkpoint_types.h |    2 +
 net/checkpoint.c                 |   25 ++----
 net/ipv4/checkpoint.c            |  192 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+), 17 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 260a1ee..4ec4dd9 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -221,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
        kfree(ctx->pids_arr);
 
+       sock_list_free(&ctx->listen_sockets);
+
        kfree(ctx);
 }
 
@@ -249,6 +251,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
        spin_lock_init(&ctx->lock);
 #endif
 
+       INIT_LIST_HEAD(&ctx->listen_sockets);
+
        err = -EBADF;
        ctx->file = fget(fd);
        if (!ctx->file)
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index b5f958e..2693a5d 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -16,6 +16,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -475,6 +476,102 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
        struct ckpt_hdr h;
+       __u32 daddr;
+       __u32 rcv_saddr;
+       __u32 saddr;
+       __u16 dport;
+       __u16 num;
+       __u16 sport;
+       __s16 uc_ttl;
+       __u16 cmsg_flags;
+
+       struct {
+               __u64 timeout;
+               __u32 ato;
+               __u32 lrcvtime;
+               __u16 last_seg_size;
+               __u16 rcv_mss;
+               __u8 pending;
+               __u8 quick;
+               __u8 pingpong;
+               __u8 blocked;
+       } icsk_ack __attribute__ ((aligned(8)));
+
+       /* FIXME: Skipped opt, tos, multicast, cork settings */
+
+       struct {
+               __u64 last_synq_overflow;
+
+               __u32 rcv_nxt;
+               __u32 copied_seq;
+               __u32 rcv_wup;
+               __u32 snd_nxt;
+               __u32 snd_una;
+               __u32 snd_sml;
+               __u32 rcv_tstamp;
+               __u32 lsndtime;
+
+               __u32 snd_wl1;
+               __u32 snd_wnd;
+               __u32 max_window;
+               __u32 mss_cache;
+               __u32 window_clamp;
+               __u32 rcv_ssthresh;
+               __u32 frto_highmark;
+
+               __u32 srtt;
+               __u32 mdev;
+               __u32 mdev_max;
+               __u32 rttvar;
+               __u32 rtt_seq;
+
+               __u32 packets_out;
+               __u32 retrans_out;
+
+               __u32 snd_up;
+               __u32 rcv_wnd;
+               __u32 write_seq;
+               __u32 pushed_seq;
+               __u32 lost_out;
+               __u32 sacked_out;
+               __u32 fackets_out;
+               __u32 tso_deferred;
+               __u32 bytes_acked;
+
+               __s32 lost_cnt_hint;
+               __u32 retransmit_high;
+
+               __u32 lost_retrans_low;
+
+               __u32 prior_ssthresh;
+               __u32 high_seq;
+
+               __u32 retrans_stamp;
+               __u32 undo_marker;
+               __s32 undo_retrans;
+               __u32 total_retrans;
+
+               __u32 urg_seq;
+               __u32 keepalive_time;
+               __u32 keepalive_intvl;
+
+               __u16 urg_data;
+               __u16 advmss;
+               __u8 frto_counter;
+               __u8 nonagle;
+
+               __u8 ecn_flags;
+               __u8 reordering;
+
+               __u8 keepalive_probes;
+       } tcp __attribute__ ((aligned(8)));
+
+       struct {
+               struct in6_addr saddr;
+               struct in6_addr rcv_saddr;
+               struct in6_addr daddr;
+       } inet6 __attribute__ ((aligned(8)));
+
        __u32 laddr_len;
        __u32 raddr_len;
        struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index fa57cdc..91c141b 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -65,6 +65,8 @@ struct ckpt_ctx {
        struct list_head pgarr_list;    /* page array to dump VMA contents */
        struct list_head pgarr_pool;    /* pool of empty page arrays chain */
 
+       struct list_head listen_sockets;/* listening parent sockets */
+
        /* [multi-process checkpoint] */
        struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
        int nr_tasks;                   /* size of tasks array */
diff --git a/net/checkpoint.c b/net/checkpoint.c
index e7e8e75..3d6da68 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -90,6 +90,7 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
                                struct sk_buff_head *queue,
+                               uint16_t family,
                                int dst_objref)
 {
        struct sk_buff *skb;
@@ -98,11 +99,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
                struct ckpt_hdr_socket_buffer *h;
                int ret = 0;
 
-               /* FIXME: This could be a false positive for non-unix
-                *        buffers, so add a type check here in the
-                *        future
-                */
-               if (UNIXCB(skb).fp) {
+               if ((family == AF_UNIX) && UNIXCB(skb).fp) {
                        ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
                        return -EBUSY;
                }
@@ -141,6 +138,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
                              struct sk_buff_head *queue,
+                             uint16_t family,
                              int dst_objref)
 {
        struct ckpt_hdr_socket_queue *h;
@@ -160,7 +158,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
        h->skb_count = ret;
        ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
        if (!ret)
-               ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+               ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
        ckpt_hdr_put(ctx, h);
@@ -182,12 +180,14 @@ int sock_deferred_write_buffers(void *data)
                return dst_objref;
        }
 
-       ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+       ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+                                dq->sk->sk_family, dst_objref);
        ckpt_debug("write recv buffers: %i\n", ret);
        if (ret < 0)
                return ret;
 
-       ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+       ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+                                dq->sk->sk_family, dst_objref);
        ckpt_debug("write send buffers: %i\n", ret);
 
        return ret;
@@ -710,15 +710,6 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
        if (ret < 0)
                goto err;
 
-       if ((h->sock_common.family == AF_INET) &&
-           (h->sock.state != TCP_LISTEN)) {
-               /* Temporary hack to enable restore of TCP_LISTEN sockets
-                * while forcing anything else to a closed state
-                */
-               sock->sk->sk_state = TCP_CLOSE;
-               sock->state = SS_UNCONNECTED;
-       }
-
        ckpt_hdr_put(ctx, h);
        return sock->sk;
  err:
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..0edfa3e 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
        struct ckpt_ctx *ctx;
@@ -28,6 +29,176 @@ struct dq_buffers {
        struct sock *sk;
 };
 
+/*
+ * Match callback handed to sock_list_find(): returns non-zero when @sk and
+ * @parent carry the same source port.  The port is the only thing compared.
+ */
+static int sock_is_parent(struct sock *sk, struct sock *parent)
+{
+       struct inet_sock *child_inet = inet_sk(sk);
+       struct inet_sock *parent_inet = inet_sk(parent);
+
+       return child_inet->sport == parent_inet->sport;
+}
+
+/*
+ * Search ctx->listen_sockets for an entry that sock_is_parent() accepts
+ * for @sk, and hand back whatever sock_list_find() returns.
+ */
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+       struct sock *parent;
+
+       parent = sock_list_find(&ctx->listen_sockets, sk, sock_is_parent);
+
+       return parent;
+}
+
+/*
+ * Deferred-queue callback that hashes a restored socket and attaches it
+ * to a local port.
+ *
+ * @data: points to the struct dq_sock queued by sock_defer_hash()
+ *
+ * When sock_get_parent() finds a socket on ctx->listen_sockets with the
+ * same source port, the restored socket inherits that parent's port.
+ * Otherwise the port is taken through inet_hash_connect().
+ *
+ * Returns 0 on success, or the error reported by inet_hash_connect().
+ */
+static int sock_hash_parent(void *data)
+{
+       struct dq_sock *dq = (struct dq_sock *)data;
+       struct inet_sock *inet = inet_sk(dq->sk);
+       struct sock *parent;
+       int ret = 0;
+
+       ckpt_debug("Doing post-restart hash\n");
+
+       dq->sk->sk_prot->hash(dq->sk);
+
+       parent = sock_get_parent(dq->ctx, dq->sk);
+       if (parent) {
+               inet->num = ntohs(inet->sport);
+               local_bh_disable();
+               __inet_inherit_port(parent, dq->sk);
+               local_bh_enable();
+       } else {
+               /*
+                * num is cleared only for the duration of the call and is
+                * put back from sport afterwards, whether or not the call
+                * succeeded.  A failure is no longer swallowed.
+                */
+               inet->num = 0;
+               ret = inet_hash_connect(&tcp_death_row, dq->sk);
+               inet->num = ntohs(inet->sport);
+       }
+
+       return ret;
+}
+
+/*
+ * Queue sock_hash_parent() on ctx->deferqueue for @sock.
+ *
+ * The socket and its context are bundled in an on-stack struct dq_sock
+ * that is passed to deferqueue_add() together with its size.  Returns the
+ * result of deferqueue_add().
+ */
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+       struct dq_sock entry = {
+               .ctx = ctx,
+               .sk = sock,
+       };
+
+       return deferqueue_add(ctx->deferqueue, &entry, sizeof(entry),
+                             sock_hash_parent, NULL);
+}
+
+/*
+ * Transfer TCP state between a live tcp_sock and the "tcp" section of an
+ * INET socket checkpoint header.
+ *
+ * @ctx: checkpoint/restart context (not used here)
+ * @sk:  live TCP socket
+ * @hh:  INET socket checkpoint header
+ * @op:  direction selector, handed unchanged to CKPT_COPY() for each field
+ *
+ * The copies are listed in the order in which the fields are declared in
+ * struct ckpt_hdr_socket_inet.  Always returns 0.
+ *
+ * NOTE(review): hh->tcp.last_synq_overflow is declared in the header but
+ * is not transferred here -- confirm that this is intentional.
+ */
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+                               struct tcp_sock *sk,
+                               struct ckpt_hdr_socket_inet *hh,
+                               int op)
+{
+       struct tcp_sock *tp = sk;
+
+       CKPT_COPY(op, hh->tcp.rcv_nxt, tp->rcv_nxt);
+       CKPT_COPY(op, hh->tcp.copied_seq, tp->copied_seq);
+       CKPT_COPY(op, hh->tcp.rcv_wup, tp->rcv_wup);
+       CKPT_COPY(op, hh->tcp.snd_nxt, tp->snd_nxt);
+       CKPT_COPY(op, hh->tcp.snd_una, tp->snd_una);
+       CKPT_COPY(op, hh->tcp.snd_sml, tp->snd_sml);
+       CKPT_COPY(op, hh->tcp.rcv_tstamp, tp->rcv_tstamp);
+       CKPT_COPY(op, hh->tcp.lsndtime, tp->lsndtime);
+
+       CKPT_COPY(op, hh->tcp.snd_wl1, tp->snd_wl1);
+       CKPT_COPY(op, hh->tcp.snd_wnd, tp->snd_wnd);
+       CKPT_COPY(op, hh->tcp.max_window, tp->max_window);
+       CKPT_COPY(op, hh->tcp.mss_cache, tp->mss_cache);
+       CKPT_COPY(op, hh->tcp.window_clamp, tp->window_clamp);
+       CKPT_COPY(op, hh->tcp.rcv_ssthresh, tp->rcv_ssthresh);
+       CKPT_COPY(op, hh->tcp.frto_highmark, tp->frto_highmark);
+
+       CKPT_COPY(op, hh->tcp.srtt, tp->srtt);
+       CKPT_COPY(op, hh->tcp.mdev, tp->mdev);
+       CKPT_COPY(op, hh->tcp.mdev_max, tp->mdev_max);
+       CKPT_COPY(op, hh->tcp.rttvar, tp->rttvar);
+       CKPT_COPY(op, hh->tcp.rtt_seq, tp->rtt_seq);
+
+       CKPT_COPY(op, hh->tcp.packets_out, tp->packets_out);
+       CKPT_COPY(op, hh->tcp.retrans_out, tp->retrans_out);
+
+       CKPT_COPY(op, hh->tcp.snd_up, tp->snd_up);
+       CKPT_COPY(op, hh->tcp.rcv_wnd, tp->rcv_wnd);
+       CKPT_COPY(op, hh->tcp.write_seq, tp->write_seq);
+       CKPT_COPY(op, hh->tcp.pushed_seq, tp->pushed_seq);
+       CKPT_COPY(op, hh->tcp.lost_out, tp->lost_out);
+       CKPT_COPY(op, hh->tcp.sacked_out, tp->sacked_out);
+       CKPT_COPY(op, hh->tcp.fackets_out, tp->fackets_out);
+       CKPT_COPY(op, hh->tcp.tso_deferred, tp->tso_deferred);
+       CKPT_COPY(op, hh->tcp.bytes_acked, tp->bytes_acked);
+
+       CKPT_COPY(op, hh->tcp.lost_cnt_hint, tp->lost_cnt_hint);
+       CKPT_COPY(op, hh->tcp.retransmit_high, tp->retransmit_high);
+
+       CKPT_COPY(op, hh->tcp.lost_retrans_low, tp->lost_retrans_low);
+
+       CKPT_COPY(op, hh->tcp.prior_ssthresh, tp->prior_ssthresh);
+       CKPT_COPY(op, hh->tcp.high_seq, tp->high_seq);
+
+       CKPT_COPY(op, hh->tcp.retrans_stamp, tp->retrans_stamp);
+       CKPT_COPY(op, hh->tcp.undo_marker, tp->undo_marker);
+       CKPT_COPY(op, hh->tcp.undo_retrans, tp->undo_retrans);
+       CKPT_COPY(op, hh->tcp.total_retrans, tp->total_retrans);
+
+       CKPT_COPY(op, hh->tcp.urg_seq, tp->urg_seq);
+       CKPT_COPY(op, hh->tcp.keepalive_time, tp->keepalive_time);
+       CKPT_COPY(op, hh->tcp.keepalive_intvl, tp->keepalive_intvl);
+
+       CKPT_COPY(op, hh->tcp.urg_data, tp->urg_data);
+       CKPT_COPY(op, hh->tcp.advmss, tp->advmss);
+       CKPT_COPY(op, hh->tcp.frto_counter, tp->frto_counter);
+       CKPT_COPY(op, hh->tcp.nonagle, tp->nonagle);
+
+       CKPT_COPY(op, hh->tcp.ecn_flags, tp->ecn_flags);
+       CKPT_COPY(op, hh->tcp.reordering, tp->reordering);
+
+       CKPT_COPY(op, hh->tcp.keepalive_probes, tp->keepalive_probes);
+
+       return 0;
+}
+
+/*
+ * Transfer the icsk_ack state between a connection socket and the
+ * "icsk_ack" section of the checkpoint header.  @op is handed unchanged
+ * to CKPT_COPY() for each field.
+ */
+static void sock_inet_icsk_cptrst(struct inet_connection_sock *icsk,
+                                 struct ckpt_hdr_socket_inet *hh,
+                                 int op)
+{
+       CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+       CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+       CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+       CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+       CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+       CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+       CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+       CKPT_COPY(op,
+                 hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+       CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+}
+
+/*
+ * Transfer INET socket state between a live socket and its checkpoint
+ * header.
+ *
+ * @ctx:  checkpoint/restart context, used for error reporting
+ * @sock: live socket
+ * @hh:   INET socket checkpoint header
+ * @op:   CKPT_CPT saves @sock into @hh; any other value applies @hh to
+ *        @sock
+ *
+ * The inet_sock fields are copied for every supported protocol.  The
+ * icsk_ack and TCP fields are copied only for IPPROTO_TCP sockets, because
+ * a UDP socket is not an inet_connection_sock and must not be accessed
+ * through inet_csk().  For AF_INET6 sockets the three IPv6 addresses are
+ * copied as well.
+ *
+ * Returns 0 on success, the result of sock_inet_tcp_cptrst() for TCP, or
+ * -EINVAL (with no state copied) when the protocol is neither TCP nor UDP.
+ */
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+                           struct sock *sock,
+                           struct ckpt_hdr_socket_inet *hh,
+                           int op)
+{
+       struct inet_sock *sk = inet_sk(sock);
+       int ret = 0;
+
+       /* Reject unsupported protocols before any state is touched */
+       if ((sock->sk_protocol != IPPROTO_TCP) &&
+           (sock->sk_protocol != IPPROTO_UDP)) {
+               ckpt_write_err(ctx, "T", "unknown socket protocol %d",
+                              sock->sk_protocol);
+               return -EINVAL;
+       }
+
+       CKPT_COPY(op, hh->daddr, sk->daddr);
+       CKPT_COPY(op, hh->rcv_saddr, sk->rcv_saddr);
+       CKPT_COPY(op, hh->dport, sk->dport);
+       CKPT_COPY(op, hh->num, sk->num);
+       CKPT_COPY(op, hh->saddr, sk->saddr);
+       CKPT_COPY(op, hh->sport, sk->sport);
+       CKPT_COPY(op, hh->uc_ttl, sk->uc_ttl);
+       CKPT_COPY(op, hh->cmsg_flags, sk->cmsg_flags);
+
+       if (sock->sk_protocol == IPPROTO_TCP) {
+               sock_inet_icsk_cptrst(inet_csk(sock), hh, op);
+               ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sock), hh, op);
+       }
+
+       if (sock->sk_family == AF_INET6) {
+               struct ipv6_pinfo *inet6 = inet6_sk(sock);
+               if (op == CKPT_CPT) {
+                       ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+                       ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+                       ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+               } else {
+                       ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+                       ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+                       ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+               }
+       }
+
+       return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
        struct ckpt_hdr_socket_inet *in;
@@ -43,6 +214,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
        if (ret)
                goto out;
 
+       ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+       if (ret < 0)
+               goto out;
+
        ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
        ckpt_hdr_put(ctx, in);
@@ -209,8 +384,25 @@ int inet_restore(struct ckpt_ctx *ctx,
                        ckpt_debug("inet listen: %i\n", ret);
                        if (ret < 0)
                                goto out;
+
+                       ret = sock_list_add(&ctx->listen_sockets, sock->sk);
+                       if (ret < 0)
+                               goto out;
                }
        } else {
+               ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+               printk("sock_inet_cptrst: %i\n", ret);
+               if (ret)
+                       goto out;
+
+               if ((h->sock.state == TCP_ESTABLISHED) &&
+                   (h->sock.protocol == IPPROTO_TCP)) {
+                       /* Delay hashing this sock until the end so we can
+                        * hook it up with its parent (if appropriate)
+                        */
+                       ret = sock_defer_hash(ctx, sock->sk);
+               }
+
                if (!sock_flag(sock->sk, SOCK_DEAD))
                        ret = inet_defer_restore_buffers(ctx, sock->sk);
        }
-- 
1.6.2.5

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to