This patch separates o2net and o2quo from knowing about one another as much
 as possible. This is the first in a series of patches that will allow
 userspace cluster interaction. Quorum is separated out first, and will
 ultimately only be associated with the disk heartbeat as a separate module.

 To do so, this patch performs the following changes:
 * o2hb_notify() is added to handle injection of events in a synchronous
   manner. All locking is preserved as expected.
 * Disk heartbeat timeouts now inject an event for this node being down. This
   event is handled specially by o2quo, which fences the node.
 * o2quo callbacks are now called directly by heartbeat rather than going
   through o2net. Previously, o2net callbacks called o2quo callbacks
   immediately. This ordering is preserved by increasing o2quo's priority over
   o2net.
 * Two new heartbeat event types are added: O2HB_CONN_{UP,DOWN}_CB, which
   correspond to tcp connections being established and terminated.
 * Outside of callbacks, where o2net used to call o2quo functions directly,
   it now injects the O2HB_CONN_{UP,DOWN}_CB events.
 * o2net knowledge of o2quo in header files has been moved to quorum.h.
 * o2net's handling of quorum decisions on connection failure has been
   moved to o2quo.
 * o2quo is initialized by the nodemanager rather than by o2net.

 *******
 Unfortunately, this code is actually broken. Unmounting the last file system
 deadlocks on o2hb_callback_sem.

 Don't actually use this code; it's just posted for review.
 *******

 fs/ocfs2/cluster/heartbeat.c    |   14 ++++++
 fs/ocfs2/cluster/heartbeat.h    |    5 ++
 fs/ocfs2/cluster/nodemanager.c  |    3 +
 fs/ocfs2/cluster/quorum.c       |   82 +++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/cluster/quorum.h       |   13 ++----
 fs/ocfs2/cluster/tcp.c          |   36 +++++------------
 fs/ocfs2/cluster/tcp_internal.h |   12 -----
 7 files changed, 117 insertions(+), 48 deletions(-)

Signed-off-by: Jeff Mahoney <[EMAIL PROTECTED]>
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c 
linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c  2006-01-08 
18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c  2006-01-08 
18:15:23.647564032 -0500
@@ -158,6 +158,7 @@ struct o2hb_bio_wait_ctxt {
 static void o2hb_write_timeout(void *arg)
 {
        struct o2hb_region *reg = arg;
+       struct o2nm_node *node = o2nm_get_node_by_num(o2nm_this_node());
 
        mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
             "milliseconds\n", reg->hr_dev_name,
@@ -588,6 +589,7 @@ static void o2hb_queue_node_event(struct
 {
        assert_spin_locked(&o2hb_live_lock);
 
+       INIT_LIST_HEAD(&event->hn_item);
        event->hn_event_type = type;
        event->hn_node = node;
        event->hn_node_num = node_num;
@@ -598,6 +600,18 @@ static void o2hb_queue_node_event(struct
        list_add_tail(&event->hn_item, &o2hb_node_events);
 }
 
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+                 int node_num)
+{
+       struct o2hb_node_event event;
+
+       spin_lock(&o2hb_live_lock);
+       o2hb_queue_node_event(&event, type, node, node_num);
+       spin_unlock(&o2hb_live_lock);
+       o2hb_run_event_list(&event);
+}
+EXPORT_SYMBOL_GPL(o2hb_notify);
+
 static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
 {
        struct o2hb_node_event event =
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h 
linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h  2006-01-08 
18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h  2006-01-08 
18:13:52.643398768 -0500
@@ -46,6 +46,8 @@ extern unsigned int o2hb_dead_threshold;
 enum o2hb_callback_type {
        O2HB_NODE_DOWN_CB = 0,
        O2HB_NODE_UP_CB,
+       O2HB_CONN_DOWN_CB,              /* When a TCP connection fails */
+       O2HB_CONN_UP_CB,                /* When a TCP connection is made */
        O2HB_NUM_CB
 };
 
@@ -78,5 +80,8 @@ int o2hb_check_node_heartbeating(u8 node
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+                 int node_num);
+
 
 #endif /* O2CLUSTER_HEARTBEAT_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c 
linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c        2006-01-08 
18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c        2006-01-08 
18:13:52.644398616 -0500
@@ -27,6 +27,7 @@
 #include "endian.h"
 #include "tcp.h"
 #include "nodemanager.h"
+#include "quorum.h"
 #include "heartbeat.h"
 #include "masklog.h"
 #include "sys.h"
@@ -740,6 +741,7 @@ static void __exit exit_o2nm(void)
        configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
        o2cb_sys_shutdown();
 
+       o2quo_exit();
        o2net_exit();
 }
 
@@ -750,6 +752,7 @@ static int __init init_o2nm(void)
        cluster_print_version();
 
        o2hb_init();
+       o2quo_init();
        o2net_init();
 
        ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c 
linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c     2006-01-08 
18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c     2006-01-08 
18:17:37.908153320 -0500
@@ -63,8 +63,14 @@ static struct o2quo_state {
        unsigned long           qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
        int                     qs_holds;
        unsigned long           qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       struct work_struct      qs_node_work[O2NM_MAX_NODES];
 } o2quo_state;
 
+static struct o2hb_callback_func o2quo_hb_up_cb, o2quo_hb_down_cb;
+static struct o2hb_callback_func o2quo_hb_conn_up, o2quo_hb_conn_down;
+#define O2QUO_HB_PRI 0x1
+#define O2QUO_DELAY_MS   ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
 /* this is horribly heavy-handed.  It should instead flip the file
  * system RO and call some userspace script. */
 static void o2quo_fence_self(void)
@@ -184,7 +190,7 @@ static void o2quo_clear_hold(struct o2qu
  * the connection.  the hold will be droped in conn_up or hb_down.  it might be
  * perpetuated by con_err until hb_down.  if we already have a conn, we might
  * be dropping a hold that conn_up got. */
-void o2quo_hb_up(u8 node)
+void o2quo_hb_up(struct o2nm_node *_node, int node, void *data)
 {
        struct o2quo_state *qs = &o2quo_state;
 
@@ -208,7 +214,7 @@ void o2quo_hb_up(u8 node)
 
 /* hb going down releases any holds we might have had due to this node from
  * conn_up, conn_err, or hb_up */
-void o2quo_hb_down(u8 node)
+void o2quo_hb_down(struct o2nm_node *_node, int node, void *data)
 {
        struct o2quo_state *qs = &o2quo_state;
 
@@ -226,6 +237,8 @@ void o2quo_hb_down(u8 node)
        o2quo_clear_hold(qs, node);
 
        spin_unlock(&qs->qs_lock);
+
+       cancel_delayed_work(&qs->qs_node_work[node]);
 }
 
 /* this tells us that we've decided that the node is still heartbeating
@@ -233,9 +246,10 @@ void o2quo_hb_down(u8 node)
  * and indicates that we must now make a quorum decision in the future,
  * though we might be doing so after waiting for holds to drain.  Here
  * we'll be dropping the hold from conn_err. */
-void o2quo_hb_still_up(u8 node)
+void o2quo_hb_still_up(void *arg)
 {
        struct o2quo_state *qs = &o2quo_state;
+       u8 node = (u8)(long)arg;
 
        spin_lock(&qs->qs_lock);
 
@@ -252,7 +266,7 @@ void o2quo_hb_still_up(u8 node)
  * hb_up or hb_down.  it might be perpetuated by con_err until hb_down.  if
  * it's already heartbeating we we might be dropping a hold that conn_up got.
  * */
-void o2quo_conn_up(u8 node)
+void o2quo_conn_up(struct o2nm_node *_node, int node, void *data)
 {
        struct o2quo_state *qs = &o2quo_state;
 
@@ -278,7 +292,7 @@ void o2quo_conn_up(u8 node)
  * still heartbeating we grab a hold that will delay decisions until either the
  * node stops heartbeating from hb_down or the caller decides that the node is
  * still up and calls still_up */
-void o2quo_conn_err(u8 node)
+void o2quo_conn_err(struct o2nm_node *_node, int node, void *data)
 {
        struct o2quo_state *qs = &o2quo_state;
 
@@ -299,17 +313,78 @@ void o2quo_conn_err(u8 node)
                o2quo_set_hold(qs, node);
 
        spin_unlock(&qs->qs_lock);
+
+       schedule_delayed_work(&qs->qs_node_work[node],
+                             msecs_to_jiffies(O2QUO_DELAY_MS));
 }
 
-void o2quo_init(void)
+static void o2quo_unregister_hb_callbacks(void)
+{
+       int ret;
+
+       ret = o2hb_unregister_callback(&o2quo_hb_conn_up);
+       if (ret < 0)
+               mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+                    "conn up callback!\n", ret);
+
+       ret = o2hb_unregister_callback(&o2quo_hb_conn_down);
+       if (ret < 0)
+               mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+                    "conn down callback!\n", ret);
+       ret = o2hb_unregister_callback(&o2quo_hb_up_cb);
+       if (ret < 0)
+               mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+                    "callback!\n", ret);
+
+       ret = o2hb_unregister_callback(&o2quo_hb_down_cb);
+       if (ret < 0)
+               mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+                    "callback!\n", ret);
+}
+
+static int o2quo_register_hb_callbacks(void)
+{
+       int ret;
+
+       o2hb_setup_callback(&o2quo_hb_down_cb, O2HB_NODE_DOWN_CB,
+                           o2quo_hb_down, NULL, O2QUO_HB_PRI);
+       o2hb_setup_callback(&o2quo_hb_up_cb, O2HB_NODE_UP_CB,
+                           o2quo_hb_up, NULL, O2QUO_HB_PRI);
+       o2hb_setup_callback(&o2quo_hb_conn_down, O2HB_CONN_DOWN_CB,
+                           o2quo_conn_err, NULL, O2QUO_HB_PRI);
+       o2hb_setup_callback(&o2quo_hb_conn_up, O2HB_CONN_UP_CB,
+                           o2quo_conn_up, NULL, O2QUO_HB_PRI);
+
+       ret = o2hb_register_callback(&o2quo_hb_up_cb);
+       if (ret == 0)
+               ret = o2hb_register_callback(&o2quo_hb_down_cb);
+       if (ret == 0)
+               ret = o2hb_register_callback(&o2quo_hb_conn_up);
+       if (ret == 0)
+               ret = o2hb_register_callback(&o2quo_hb_conn_down);
+
+       if (ret)
+               o2quo_unregister_hb_callbacks();
+
+       return ret;
+}
+
+
+int o2quo_init(void)
 {
        struct o2quo_state *qs = &o2quo_state;
+       int i;
 
        spin_lock_init(&qs->qs_lock);
        INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+       for (i = 0; i < O2NM_MAX_NODES; i++)
+               INIT_WORK(&qs->qs_node_work[i], o2quo_hb_still_up, (void *)i);
+
+       return o2quo_register_hb_callbacks();
 }
 
 void o2quo_exit(void)
 {
        flush_scheduled_work();
+       o2quo_unregister_hb_callbacks();
 }
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h 
linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h     2006-01-08 
18:23:29.378721672 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h     2006-01-08 
18:23:55.863695344 -0500
@@ -23,14 +23,13 @@
 #ifndef O2CLUSTER_QUORUM_H
 #define O2CLUSTER_QUORUM_H
 
-void o2quo_init(void);
+int o2quo_init(void);
 void o2quo_exit(void);
-
-void o2quo_hb_up(u8 node);
-void o2quo_hb_down(u8 node);
-void o2quo_hb_still_up(u8 node);
-void o2quo_conn_up(u8 node);
-void o2quo_conn_err(u8 node);
 void o2quo_disk_timeout(void);
 
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS  ((o2hb_dead_threshold + 2) * 
O2HB_REGION_TIMEOUT_MS)
+
 #endif /* O2CLUSTER_QUORUM_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c 
linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c        2006-01-08 
18:23:29.379721520 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c        2006-01-08 
18:13:52.646398312 -0500
@@ -67,7 +67,6 @@
 #include "nodemanager.h"
 #define MLOG_MASK_PREFIX ML_TCP
 #include "masklog.h"
-#include "quorum.h"
 
 #include "tcp_internal.h"
 
@@ -128,7 +127,7 @@ static struct workqueue_struct *o2net_wq
 static struct work_struct o2net_listen_work;
 
 static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
-#define O2NET_HB_PRI 0x1
+#define O2NET_HB_PRI 0x2
 
 static struct o2net_handshake *o2net_hand;
 static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
@@ -390,9 +389,9 @@ static void o2net_set_nn_state(struct o2
                wake_up(&nn->nn_sc_wq);
 
        if (!was_err && nn->nn_persistent_error) {
-               o2quo_conn_err(o2net_num_from_nn(nn));
-               queue_delayed_work(o2net_wq, &nn->nn_still_up,
-                                  msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+               u8 node_num = o2net_num_from_nn(nn);
+               struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+               o2hb_notify(O2HB_CONN_DOWN_CB, node, node_num);
        }
 
        if (was_valid && !valid) {
@@ -402,7 +401,11 @@ static void o2net_set_nn_state(struct o2
        }
 
        if (!was_valid && valid) {
-               o2quo_conn_up(o2net_num_from_nn(nn));
+               u8 node_num = o2net_num_from_nn(nn);
+               struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+
+               o2hb_notify(O2HB_CONN_UP_CB, node, node_num);
+
                /* this is a bit of a hack.  we only try reconnecting
                 * when heartbeating starts until we get a connection.
                 * if that connection then dies we don't try reconnecting.
@@ -1424,13 +1427,6 @@ static void o2net_connect_expired(void *
        spin_unlock(&nn->nn_lock);
 }
 
-static void o2net_still_up(void *arg)
-{
-       struct o2net_node *nn = arg;
-
-       o2quo_hb_still_up(o2net_num_from_nn(nn));
-}
-
 /* ------------------------------------------------------------ */
 
 void o2net_disconnect_node(struct o2nm_node *node)
@@ -1445,7 +1441,6 @@ void o2net_disconnect_node(struct o2nm_n
        if (o2net_wq) {
                cancel_delayed_work(&nn->nn_connect_expired);
                cancel_delayed_work(&nn->nn_connect_work);
-               cancel_delayed_work(&nn->nn_still_up);
                flush_workqueue(o2net_wq);
        }
 }
@@ -1453,8 +1448,6 @@ void o2net_disconnect_node(struct o2nm_n
 static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
                                  void *data)
 {
-       o2quo_hb_down(node_num);
-
        if (node_num != o2nm_this_node())
                o2net_disconnect_node(node);
 }
@@ -1464,8 +1457,6 @@ static void o2net_hb_node_up_cb(struct o
 {
        struct o2net_node *nn = o2net_nn_from_num(node_num);
 
-       o2quo_hb_up(node_num);
-
        /* ensure an immediate connect attempt */
        nn->nn_last_connect_attempt = jiffies -
                (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
@@ -1739,7 +1730,7 @@ int o2net_start_listening(struct o2nm_no
                destroy_workqueue(o2net_wq);
                o2net_wq = NULL;
        } else
-               o2quo_conn_up(node->nd_num);
+               o2hb_notify(O2HB_CONN_UP_CB, node, node->nd_num);
 
        return ret;
 }
@@ -1776,7 +1767,7 @@ void o2net_stop_listening(struct o2nm_no
        sock_release(o2net_listen_sock);
        o2net_listen_sock = NULL;
 
-       o2quo_conn_err(node->nd_num);
+       o2hb_notify(O2HB_CONN_DOWN_CB, node, node->nd_num);
 }
 
 /* ------------------------------------------------------------ */
@@ -1785,8 +1776,6 @@ int o2net_init(void)
 {
        unsigned long i;
 
-       o2quo_init();
-
        o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
        o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
        o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1805,11 +1794,11 @@ int o2net_init(void)
 
        for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
                struct o2net_node *nn = o2net_nn_from_num(i);
+               memset(nn, 0, sizeof (*nn));
 
                spin_lock_init(&nn->nn_lock);
                INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
                INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
-               INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
                /* until we see hb from a node we'll return einval */
                nn->nn_persistent_error = -ENOTCONN;
                init_waitqueue_head(&nn->nn_sc_wq);
@@ -1822,7 +1811,6 @@ int o2net_init(void)
 
 void o2net_exit(void)
 {
-       o2quo_exit();
        kfree(o2net_hand);
        kfree(o2net_keep_req);
        kfree(o2net_keep_resp);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h 
linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h       2006-01-08 
18:23:29.379721520 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h       2006-01-08 
18:13:52.646398312 -0500
@@ -28,12 +28,7 @@
 #define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
 
 /* same as hb delay, we're waiting for another node to recognize our hb */
-#define O2NET_RECONNECT_DELAY_MS       O2HB_REGION_TIMEOUT_MS
-
-/* we're delaying our quorum decision so that heartbeat will have timed
- * out truly dead nodes by the time we come around to making decisions
- * on their number */
-#define O2NET_QUORUM_DELAY_MS  ((o2hb_dead_threshold + 2) * 
O2HB_REGION_TIMEOUT_MS)
+#define O2NET_RECONNECT_DELAY_MS       2000    
 
 #define O2NET_KEEPALIVE_DELAY_SECS     5
 #define O2NET_IDLE_TIMEOUT_SECS                10
@@ -87,11 +82,6 @@ struct o2net_node {
         * established.  this expiring gives up on the node and errors out
         * transmits */
        struct work_struct              nn_connect_expired;
-
-       /* after we give up on a socket we wait a while before deciding
-        * that it is still heartbeating and that we should do some
-        * quorum work */
-       struct work_struct              nn_still_up;
 };
 
 struct o2net_sock_container {
_______________________________________________
Ocfs2-devel mailing list
[email protected]
http://oss.oracle.com/mailman/listinfo/ocfs2-devel

Reply via email to