Hi Andrew,
>> All the more reason to start using the stonith api directly.
>> I was playing around last night with the dlm_controld.pcmk code:
>>
>> https://github.com/beekhof/dlm/commit/9f890a36f6844c2a0567aea0a0e29cc47b01b787
>
> Doesn't seem to apply to 3.0.17, so I rebased that commit against it for
> my build. Then it doesn't compile without the attached patch.
> It may need to be rebased a bit against your tree.
>
> Now I have package built and am building node images. Will try shortly.
Fencing from within dlm_controld.pcmk still did not work with your first
patch against that _no_mainloop function (expected).
So I did my best to build packages from the current git tree.
Voila! I got the failed node correctly fenced!
I'll do some more extensive testing over the next few days, but I believe
everything should be much better now.
I knew you're a genius, he-he ;)
So, here are the steps to get DLM to handle CPG NODEDOWN events correctly with
pacemaker using the openais stack:
1. Build pacemaker (as of 2011-09-28) from git.
2. Apply the attached patches to the cluster-3.0.17 source tree.
3. Build dlm_controld.pcmk
One note - gfs2_controld probably needs to be fixed too (FIXME).
Best regards,
Vladislav
diff -urNp cluster-3.0.17.orig/group/dlm_controld/cpg.c cluster-3.0.17/group/dlm_controld/cpg.c
--- cluster-3.0.17.orig/group/dlm_controld/cpg.c 2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/cpg.c 2011-09-05 09:09:49.042858374 +0000
@@ -1446,7 +1446,8 @@ static int add_change(struct lockspace *
log_group(ls, "add_change cg %u remove nodeid %d reason %d",
cg->seq, memb->nodeid, left_list[i].reason);
- if (left_list[i].reason == CPG_REASON_PROCDOWN)
+ if (left_list[i].reason == CPG_REASON_NODEDOWN ||
+ left_list[i].reason == CPG_REASON_PROCDOWN)
kick_node_from_cluster(memb->nodeid);
}
diff -urNp cluster-3.0.17.orig/group/dlm_controld/pacemaker.c cluster-3.0.17/group/dlm_controld/pacemaker.c
--- cluster-3.0.17.orig/group/dlm_controld/pacemaker.c 2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/pacemaker.c 2011-09-28 08:41:53.617063138 +0000
@@ -220,33 +230,17 @@ char *nodeid2name(int nodeid) {
return strdup(node->uname);
}
-static int pcmk_cluster_fd = 0;
-
-static void attrd_deadfn(int ci)
-{
- log_error("%s: Lost connection to the cluster", __FUNCTION__);
- pcmk_cluster_fd = 0;
- return;
-}
-
void kick_node_from_cluster(int nodeid)
{
- int fd = pcmk_cluster_fd;
- int rc = crm_terminate_member_no_mainloop(nodeid, NULL, &fd);
-
- if(fd > 0 && fd != pcmk_cluster_fd) {
- pcmk_cluster_fd = fd;
- client_add(pcmk_cluster_fd, NULL, attrd_deadfn);
- }
-
+ int rc = crm_terminate_member_no_mainloop(nodeid, NULL, NULL);
switch(rc) {
- case 1:
+ case 0:
log_debug("Requested that node %d be kicked from the cluster", nodeid);
break;
case -1:
log_error("Don't know how to kick node %d from the cluster", nodeid);
break;
- case 0:
+ case 1:
log_error("Could not kick node %d from the cluster", nodeid);
break;
default:
@@ -256,107 +250,55 @@ void kick_node_from_cluster(int nodeid)
return;
}
-cib_t *cib = NULL;
-
-static void cib_deadfn(int ci)
-{
- log_error("Lost connection to the cib");
- cib = NULL; /* TODO: memory leak in unlikely error path */
- return;
-}
-
-static cib_t *cib_connect(void)
-{
- int rc = 0;
- int cib_fd = 0;
- if(cib) {
- return cib;
- }
-
- cib = cib_new();
- rc = cib->cmds->signon_raw(cib, crm_system_name, cib_command, &cib_fd, NULL);
- if(rc != cib_ok) {
- log_error("Signon to cib failed: %s", cib_error2string(rc));
- cib = NULL; /* TODO: memory leak in unlikely error path */
-
- } else {
- client_add(cib_fd, NULL, cib_deadfn);
- }
- return cib;
-}
-
-
int fence_in_progress(int *in_progress)
{
- int rc = 0;
- xmlNode *xpath_data;
-
- cib_connect();
- if(cib == NULL) {
- return -1;
- }
-
- /* TODO: Not definitive - but a good approximation */
- rc = cib->cmds->query(cib, "//nvpar[@name='terminate']", &xpath_data,
- cib_xpath|cib_scope_local|cib_sync_call);
-
- if(xpath_data == NULL) {
- *in_progress = 0;
- return 0;
- }
-
- log_debug("Fencing in progress: %s", xpath_data?"true":"false");
- free_xml(xpath_data);
- *in_progress = 1;
- return 1;
+ *in_progress = 0;
+ return 0;
}
-#define XPATH_MAX 1024
-
int fence_node_time(int nodeid, uint64_t *last_fenced_time)
{
int rc = 0;
- static time_t last_log = 0;
-
- xmlNode *xpath_data;
- char xpath_query[XPATH_MAX];
- crm_node_t *node = crm_get_peer(nodeid, NULL);
+ const char *uname = NULL;
+ crm_node_t *node = crm_get_peer(nodeid, uname);
+ stonith_history_t *history, *hp = NULL;
if(last_fenced_time) {
*last_fenced_time = 0;
}
- if(node == NULL || node->uname == NULL) {
- log_error("Nothing known about node %d", nodeid);
- return 0;
- }
+ if (node && node->uname) {
+ uname = node->uname;
+ st = stonith_api_new();
- cib_connect();
- if(cib == NULL) {
- return -1;
+ } else {
+ crm_err("Nothing known about node id=%d", nodeid);
+ return 0;
+ }
+
+ if(st) {
+ rc = st->cmds->connect(st, crm_system_name, NULL);
}
- snprintf(xpath_query, XPATH_MAX, "//lrm[@id='%s']", node->uname);
- rc = cib->cmds->query(
- cib, xpath_query, &xpath_data, cib_xpath|cib_scope_local|cib_sync_call);
-
- if(xpath_data == NULL) {
- /* the node has been shot - return 'now' */
- log_level(LOG_INFO, "Node %d/%s was last shot 'now'", nodeid, node->uname);
- *last_fenced_time = time(NULL);
- last_log = 0;
-
+ if(rc == stonith_ok) {
+ st->cmds->history(st, st_opt_sync_call, uname, &history, 120);
+ for(hp = history; hp; hp = hp->next) {
+ if(hp->state == st_done) {
+ *last_fenced_time = hp->completed;
+ }
+ }
+ }
+
+ if(*last_fenced_time != 0) {
+ log_debug("Node %d/%s was last shot at: %s", nodeid, ctime(*last_fenced_time));
} else {
- time_t now = time(NULL);
- if(last_log == 0) {
- log_level(LOG_INFO, "Node %d/%s has not been shot yet", nodeid, node->uname);
-
- } else if(now - last_log > 30) {
- log_level(LOG_DEBUG, "Node %d/%s has still not been shot yet", nodeid, node->uname);
- }
- last_log = now;
+ log_debug("It does not appear node %d/%s has been shot", nodeid, uname);
}
-
- free_xml(xpath_data);
+
+ if(st) {
+ st->cmds->disconnect(st);
+ stonith_api_delete(st);
+ }
+
return 0;
}
--- cluster-3.0.17/group/dlm_controld/Makefile.orig 2010-10-04 12:24:34.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/Makefile 2011-09-28 09:01:23.453252437 +0000
@@ -54,7 +54,7 @@ LDFLAGS += -L${libdir}
LDDEPS += ../lib/libgroup.a
-PCMK_LDFLAGS += -lcib -lcrmcommon -lcrmcluster -ltotem_pg
+PCMK_LDFLAGS += -lcib -lcrmcommon -lcrmcluster -ltotem_pg -lplumb -lstonithd
PCMK_LDFLAGS += `pkg-config glib-2.0 --libs`
PCMK_LDFLAGS += `xml2-config --libs`
--- cluster-3.0.17/group/dlm_controld/pacemaker.c.orig 2011-09-28 08:49:00.000000000 +0000
+++ cluster-3.0.17/group/dlm_controld/pacemaker.c 2011-09-28 08:59:50.678375731 +0000
@@ -20,6 +20,7 @@
#include <pacemaker/crm/common/ipc.h>
#include <pacemaker/crm/msg_xml.h>
#include <pacemaker/crm/cib.h>
+#include <pacemaker/crm/stonith-ng.h>
#define COMMS_DIR "/sys/kernel/config/dlm/cluster/comms"
@@ -249,16 +250,17 @@ int fence_in_progress(int *in_progress)
int fence_node_time(int nodeid, uint64_t *last_fenced_time)
{
int rc = 0;
- const char *uname = NULL;
- crm_node_t *node = crm_get_peer(nodeid, uname);
+ const char *node_uname = NULL;
+ crm_node_t *node = crm_get_peer(nodeid, node_uname);
stonith_history_t *history, *hp = NULL;
+ stonith_t *st = NULL;
if(last_fenced_time) {
*last_fenced_time = 0;
}
if (node && node->uname) {
- uname = node->uname;
+ node_uname = node->uname;
st = stonith_api_new();
} else {
@@ -271,7 +273,7 @@ int fence_node_time(int nodeid, uint64_t
}
if(rc == stonith_ok) {
- st->cmds->history(st, st_opt_sync_call, uname, &history, 120);
+ st->cmds->history(st, st_opt_sync_call, node_uname, &history, 120);
for(hp = history; hp; hp = hp->next) {
if(hp->state == st_done) {
*last_fenced_time = hp->completed;
@@ -280,9 +282,9 @@ int fence_node_time(int nodeid, uint64_t
}
if(*last_fenced_time != 0) {
- log_debug("Node %d/%s was last shot at: %s", nodeid, ctime(*last_fenced_time));
+ log_debug("Node %d/%s was last shot at: %s", nodeid, node_uname, ctime(last_fenced_time));
} else {
- log_debug("It does not appear node %d/%s has been shot", nodeid, uname);
+ log_debug("It does not appear node %d/%s has been shot", nodeid, node_uname);
}
if(st) {
_______________________________________________
Pacemaker mailing list: [email protected]
http://oss.clusterlabs.org/mailman/listinfo/pacemaker
Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker