Hi Andrew,

>> All the more reason to start using the stonith api directly.
>> I was playing around list night with the dlm_controld.pcmk code:
>>    
>> https://github.com/beekhof/dlm/commit/9f890a36f6844c2a0567aea0a0e29cc47b01b787
> 
> Doesn't seem to apply to 3.0.17, so I rebased that commit against it for
> my build. Then it doesn't compile without attached patch.
> It may need to be rebased a bit against your tree.
> 
> Now I have package built and am building node images. Will try shortly.

Fencing from within dlm_controld.pcmk still did not work with your first
patch against that _no_mainloop function (expected).

So I did my best to build packages from the current git tree.

Voila! I got the failed node correctly fenced!
I'll do some more extensive testing over the next few days, but I believe
everything should be much better now.

I knew you're a genius, he-he ;)

So, here are the steps to get DLM to handle CPG NODEDOWN events correctly
with pacemaker using the openais stack:

1. Build pacemaker (as of 2011-09-28) from git.
2. Apply the attached patches to the cluster-3.0.17 source tree.
3. Build dlm_controld.pcmk

One note - gfs2_controld probably needs to be fixed too (FIXME).

Best regards,
Vladislav
diff -urNp cluster-3.0.17.orig/group/dlm_controld/cpg.c cluster-3.0.17/group/dlm_controld/cpg.c
--- cluster-3.0.17.orig/group/dlm_controld/cpg.c	2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/cpg.c	2011-09-05 09:09:49.042858374 +0000
@@ -1446,7 +1446,8 @@ static int add_change(struct lockspace *
 		log_group(ls, "add_change cg %u remove nodeid %d reason %d",
 			  cg->seq, memb->nodeid, left_list[i].reason);
 
-		if (left_list[i].reason == CPG_REASON_PROCDOWN)
+		if (left_list[i].reason == CPG_REASON_NODEDOWN ||
+		    left_list[i].reason == CPG_REASON_PROCDOWN)
 			kick_node_from_cluster(memb->nodeid);
 	}
 
diff -urNp cluster-3.0.17.orig/group/dlm_controld/pacemaker.c cluster-3.0.17/group/dlm_controld/pacemaker.c
--- cluster-3.0.17.orig/group/dlm_controld/pacemaker.c	2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/pacemaker.c	2011-09-28 08:41:53.617063138 +0000
@@ -220,33 +230,17 @@ char *nodeid2name(int nodeid) {
     return strdup(node->uname);
 }
 
-static int pcmk_cluster_fd = 0;
-
-static void attrd_deadfn(int ci) 
-{
-    log_error("%s: Lost connection to the cluster", __FUNCTION__);
-    pcmk_cluster_fd = 0;
-    return;
-}
-
 void kick_node_from_cluster(int nodeid)
 {
-    int fd = pcmk_cluster_fd;
-    int rc = crm_terminate_member_no_mainloop(nodeid, NULL, &fd);
-    
-    if(fd > 0 && fd != pcmk_cluster_fd) {
-	pcmk_cluster_fd = fd;
-	client_add(pcmk_cluster_fd, NULL, attrd_deadfn);
-    }
-    
+    int rc = crm_terminate_member_no_mainloop(nodeid, NULL, NULL);   
     switch(rc) {
-	case 1:
+	case 0:
 	    log_debug("Requested that node %d be kicked from the cluster", nodeid);
 	    break;
 	case -1:
 	    log_error("Don't know how to kick node %d from the cluster", nodeid);
 	    break;
-	case 0:
+	case 1:
 	    log_error("Could not kick node %d from the cluster", nodeid);
 	    break;
 	default:
@@ -256,107 +250,55 @@ void kick_node_from_cluster(int nodeid)
     return;
 }
 
-cib_t *cib = NULL;
-
-static void cib_deadfn(int ci) 
-{
-    log_error("Lost connection to the cib");
-    cib = NULL; /* TODO: memory leak in unlikely error path */
-    return;
-}
-
-static cib_t *cib_connect(void) 
-{
-    int rc = 0;
-    int cib_fd = 0;
-    if(cib) {
-	return cib;
-    }
-    
-    cib = cib_new();
-    rc = cib->cmds->signon_raw(cib, crm_system_name, cib_command, &cib_fd, NULL);
-    if(rc != cib_ok) {
-	log_error("Signon to cib failed: %s", cib_error2string(rc));
-	cib = NULL; /* TODO: memory leak in unlikely error path */
-
-    } else {
-	client_add(cib_fd, NULL, cib_deadfn);
-    }
-    return cib;
-}
-
-
 int fence_in_progress(int *in_progress)
 {
-    int rc = 0;
-    xmlNode *xpath_data;
-
-    cib_connect();    
-    if(cib == NULL) {
-	return -1;
-    }
-
-    /* TODO: Not definitive - but a good approximation */
-    rc = cib->cmds->query(cib, "//nvpar[@name='terminate']", &xpath_data,
-			  cib_xpath|cib_scope_local|cib_sync_call);
-
-    if(xpath_data == NULL) {
-	*in_progress = 0;
-	return 0;
-    }
-
-    log_debug("Fencing in progress: %s", xpath_data?"true":"false");	
-    free_xml(xpath_data);
-    *in_progress = 1;
-    return 1;
+    *in_progress = 0;
+    return 0;
 }
 
-#define XPATH_MAX  1024
-
 int fence_node_time(int nodeid, uint64_t *last_fenced_time)
 {
     int rc = 0;
-    static time_t last_log = 0;
-
-    xmlNode *xpath_data;
-    char xpath_query[XPATH_MAX];
-    crm_node_t *node = crm_get_peer(nodeid, NULL);
+    const char *uname = NULL;
+    crm_node_t *node = crm_get_peer(nodeid, uname);
+    stonith_history_t *history, *hp = NULL;
 
     if(last_fenced_time) {
 	*last_fenced_time = 0;
     }
 
-    if(node == NULL || node->uname == NULL) {
-	log_error("Nothing known about node %d", nodeid);	
-	return 0;
-    }
+    if (node && node->uname) {
+        uname = node->uname;
+        st = stonith_api_new();
 
-    cib_connect();
-    if(cib == NULL) {
-	return -1;
+    } else {
+        crm_err("Nothing known about node id=%d", nodeid);
+        return 0;
+    }
+    
+    if(st) {
+	rc = st->cmds->connect(st, crm_system_name, NULL);
     }
 
-    snprintf(xpath_query, XPATH_MAX, "//lrm[@id='%s']", node->uname);
-    rc = cib->cmds->query(
-	cib, xpath_query, &xpath_data, cib_xpath|cib_scope_local|cib_sync_call);
-
-    if(xpath_data == NULL) {
-	/* the node has been shot - return 'now' */
-	log_level(LOG_INFO, "Node %d/%s was last shot 'now'", nodeid, node->uname);	
-	*last_fenced_time = time(NULL);
-	last_log = 0;
-
+    if(rc == stonith_ok) {
+        st->cmds->history(st, st_opt_sync_call, uname, &history, 120);
+        for(hp = history; hp; hp = hp->next) {
+            if(hp->state == st_done) {
+                *last_fenced_time = hp->completed;
+            }
+        }
+    }
+    
+    if(*last_fenced_time != 0) {
+        log_debug("Node %d/%s was last shot at: %s", nodeid, ctime(*last_fenced_time));	
     } else {
-	time_t now = time(NULL);
-	if(last_log == 0) {
-	    log_level(LOG_INFO, "Node %d/%s has not been shot yet", nodeid, node->uname);
-
-	} else if(now - last_log > 30) {
-	    log_level(LOG_DEBUG, "Node %d/%s has still not been shot yet", nodeid, node->uname);
-	}
-	last_log = now;
+        log_debug("It does not appear node %d/%s has been shot", nodeid, uname);
     }
-
-    free_xml(xpath_data);
+    
+    if(st) {
+        st->cmds->disconnect(st);
+        stonith_api_delete(st);
+    }
+        
     return 0;
 }
--- cluster-3.0.17/group/dlm_controld/Makefile.orig	2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/Makefile	2011-09-28 09:01:23.453252437 +0000
@@ -54,7 +54,7 @@ LDFLAGS += -L${libdir}
 
 LDDEPS += ../lib/libgroup.a
 
-PCMK_LDFLAGS += -lcib -lcrmcommon -lcrmcluster -ltotem_pg
+PCMK_LDFLAGS += -lcib -lcrmcommon -lcrmcluster -ltotem_pg -lplumb -lstonithd
 PCMK_LDFLAGS += `pkg-config glib-2.0 --libs`
 PCMK_LDFLAGS += `xml2-config --libs`
 
--- cluster-3.0.17/group/dlm_controld/pacemaker.c.orig	2011-09-28 08:49:00.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/pacemaker.c	2011-09-28 08:59:50.678375731 +0000
@@ -20,6 +20,7 @@
 #include <pacemaker/crm/common/ipc.h>
 #include <pacemaker/crm/msg_xml.h>
 #include <pacemaker/crm/cib.h>
+#include <pacemaker/crm/stonith-ng.h>
 
 #define COMMS_DIR     "/sys/kernel/config/dlm/cluster/comms"
 
@@ -249,16 +250,17 @@ int fence_in_progress(int *in_progress)
 int fence_node_time(int nodeid, uint64_t *last_fenced_time)
 {
     int rc = 0;
-    const char *uname = NULL;
-    crm_node_t *node = crm_get_peer(nodeid, uname);
+    const char *node_uname = NULL;
+    crm_node_t *node = crm_get_peer(nodeid, node_uname);
     stonith_history_t *history, *hp = NULL;
+    stonith_t *st = NULL;
 
     if(last_fenced_time) {
 	*last_fenced_time = 0;
     }
 
     if (node && node->uname) {
-        uname = node->uname;
+        node_uname = node->uname;
         st = stonith_api_new();
 
     } else {
@@ -271,7 +273,7 @@ int fence_node_time(int nodeid, uint64_t
     }
 
     if(rc == stonith_ok) {
-        st->cmds->history(st, st_opt_sync_call, uname, &history, 120);
+        st->cmds->history(st, st_opt_sync_call, node_uname, &history, 120);
         for(hp = history; hp; hp = hp->next) {
             if(hp->state == st_done) {
                 *last_fenced_time = hp->completed;
@@ -280,9 +282,9 @@ int fence_node_time(int nodeid, uint64_t
     }
     
     if(*last_fenced_time != 0) {
-        log_debug("Node %d/%s was last shot at: %s", nodeid, ctime(*last_fenced_time));	
+        log_debug("Node %d/%s was last shot at: %s", nodeid, node_uname, ctime(last_fenced_time));
     } else {
-        log_debug("It does not appear node %d/%s has been shot", nodeid, uname);
+        log_debug("It does not appear node %d/%s has been shot", nodeid, node_uname);
     }
     
     if(st) {
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker

Reply via email to