[SCM] CTDB repository - branch master updated - 0f1883c69c689b28b0c04148774840b2c4081df6

Ronnie Sahlberg Mon, 05 May 2008 22:57:22 -0700

The branch, master has been updated
       via  0f1883c69c689b28b0c04148774840b2c4081df6 (commit)
       via  8556e9dc897c6b9b9be0b52f391effb1f72fbd80 (commit)
       via  e513277fb09b951427be8351d04c877e0a15359d (commit)
      from  7e587acaf8006254e89ff9b4bf48454821c85863 (commit)


http://gitweb.samba.org/?p=sahlberg/ctdb.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 0f1883c69c689b28b0c04148774840b2c4081df6
Author: Ronnie Sahlberg <[EMAIL PROTECTED]>
Date:   Tue May 6 15:42:59 2008 +1000

    Expand the client async framework so that it can take a callback function.
    This allows us to use the async framework also for controls that return
    outdata.
    
    Add a "capabilities" field to the ctdb_node structure. This field is
    only initialized and kept valid inside the recovery daemon context and not
    inside the main ctdb daemon.
    
    Change the GET_CAPABILITIES control to return the capabilities in outdata
    instead of in the res return variable.
    
    When performing a recovery inside the recovery daemon, read the
    capabilities from all connected nodes and update the ctdb->nodes list of
    nodes.
    When building the new vnnmap after the database rebuild in recovery, do not
    include any nodes which lack the LMASTER capability in the new vnnmap,
    unless there are no available connected nodes that sport the LMASTER
    capability, in which case we let the local node (recmaster) take on the
    lmaster role temporarily (i.e. become a member of the vnnmap list).

commit 8556e9dc897c6b9b9be0b52f391effb1f72fbd80
Author: Ronnie Sahlberg <[EMAIL PROTECTED]>
Date:   Tue May 6 13:56:56 2008 +1000

    Make sure we lose all elections for the recmaster role if we do not have
    the recmaster capability.
    
    (Unless there are no other nodes at all available with this capability.)

commit e513277fb09b951427be8351d04c877e0a15359d
Author: Ronnie Sahlberg <[EMAIL PROTECTED]>
Date:   Tue May 6 13:27:17 2008 +1000

    close and reopen the reclock pnn file at regular intervals.
    
    handle failure to get/hold the reclock pnn file better and just
    treat it as a transient backend filesystem error and try again later
    instead of shutting down the recovery daemon
    
    When we have lost the pnn file and we are recmaster, release the
    recmaster role so that someone else can become recmaster instead.

-----------------------------------------------------------------------

Summary of changes:
 client/ctdb_client.c   |   21 ++++++--
 config/ctdb.init       |    4 +-
 include/ctdb_private.h |   12 ++++-
 server/ctdb_control.c  |    2 +-
 server/ctdb_recover.c  |   18 +++++++
 server/ctdb_recoverd.c |  123 +++++++++++++++++++++++++++++++++++++++++-------
 tools/ctdb.c           |    2 +-
 7 files changed, 153 insertions(+), 29 deletions(-)


Changeset truncated at 500 lines:

diff --git a/client/ctdb_client.c b/client/ctdb_client.c
index 4f3a0d5..921392c 100644
--- a/client/ctdb_client.c
+++ b/client/ctdb_client.c
@@ -2671,8 +2671,11 @@ int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, 
struct timeval timeout, ui
 static void async_callback(struct ctdb_client_control_state *state)
 {
        struct client_async_data *data = 
talloc_get_type(state->async.private_data, struct client_async_data);
+       struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct 
ctdb_context);
        int ret;
+       TDB_DATA outdata;
        int32_t res;
+       uint32_t destnode = state->c->hdr.destnode;
 
        /* one more node has responded with recmode data */
        data->count--;
@@ -2690,13 +2693,16 @@ static void async_callback(struct 
ctdb_client_control_state *state)
        
        state->async.fn = NULL;
 
-       ret = ctdb_control_recv(state->ctdb, state, data, NULL, &res, NULL);
+       ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
        if ((ret != 0) || (res != 0)) {
                if ( !data->dont_log_errors) {
                        DEBUG(DEBUG_ERR,("Async operation failed with ret=%d 
res=%d\n", ret, (int)res));
                }
                data->fail_count++;
        }
+       if ((ret == 0) && (data->callback != NULL)) {
+               data->callback(ctdb, destnode, res, outdata);
+       }
 }
 
 
@@ -2739,15 +2745,17 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
                                uint32_t *nodes,
                                struct timeval timeout,
                                bool dont_log_errors,
-                               TDB_DATA data)
+                               TDB_DATA data,
+                               client_async_callback client_callback)
 {
        struct client_async_data *async_data;
        struct ctdb_client_control_state *state;
        int j, num_nodes;
-       
+
        async_data = talloc_zero(ctdb, struct client_async_data);
        CTDB_NO_MEMORY_FATAL(ctdb, async_data);
        async_data->dont_log_errors = dont_log_errors;
+       async_data->callback = client_callback;
 
        num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
 
@@ -2886,15 +2894,16 @@ int ctdb_ctrl_getcapabilities_recv(struct ctdb_context 
*ctdb, TALLOC_CTX *mem_ct
 {
        int ret;
        int32_t res;
+       TDB_DATA outdata;
 
-       ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
-       if (ret != 0) {
+       ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
+       if ( (ret != 0) || (res != 0) ) {
                DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv 
failed\n"));
                return -1;
        }
 
        if (capabilities) {
-               *capabilities = (uint32_t)res;
+               *capabilities = *((uint32_t *)outdata.dptr);
        }
 
        return 0;
diff --git a/config/ctdb.init b/config/ctdb.init
index c83c091..922a53d 100755
--- a/config/ctdb.init
+++ b/config/ctdb.init
@@ -66,10 +66,10 @@ CTDB_OPTIONS="$CTDB_OPTIONS --reclock=$CTDB_RECOVERY_LOCK"
 [ -z "$CTDB_START_AS_DISABLED" ] || [ "$CTDB_START_AS_DISABLED" != "yes" ] || {
        CTDB_OPTIONS="$CTDB_OPTIONS --start-as-disabled"
 }
-[ -z "$CTDB_CAPABILITY_RECMASTER" ] || [ "$CTDB_CAPABILITY_RECMASTER" != "yes" 
] || {
+[ -z "$CTDB_CAPABILITY_RECMASTER" ] || [ "$CTDB_CAPABILITY_RECMASTER" != "no" 
] || {
        CTDB_OPTIONS="$CTDB_OPTIONS --no-recmaster"
 }
-[ -z "$CTDB_CAPABILITY_LMASTER" ] || [ "$CTDB_CAPABILITY_LMASTER" != "yes" ] 
|| {
+[ -z "$CTDB_CAPABILITY_LMASTER" ] || [ "$CTDB_CAPABILITY_LMASTER" != "no" ] || 
{
        CTDB_OPTIONS="$CTDB_OPTIONS --no-lmaster"
 }
 
diff --git a/include/ctdb_private.h b/include/ctdb_private.h
index d2abc99..d31b148 100644
--- a/include/ctdb_private.h
+++ b/include/ctdb_private.h
@@ -199,6 +199,11 @@ struct ctdb_node {
        uint32_t rx_cnt;
        uint32_t tx_cnt;
 
+       /* used to track node capabilities, is only valid/tracked inside the
+          recovery daemon.
+       */
+       uint32_t capabilities;
+
        /* a list of controls pending to this node, so we can time them out 
quickly
           if the node becomes disconnected */
        struct daemon_control_state *pending_controls;
@@ -1276,10 +1281,13 @@ int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
 int ctdb_set_child_logging(struct ctdb_context *ctdb);
 
 
+typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t 
node_pnn, int32_t res, TDB_DATA outdata);
+
 struct client_async_data {
        bool dont_log_errors;
        uint32_t count;
        uint32_t fail_count;
+       client_async_callback callback;
 };
 void ctdb_client_async_add(struct client_async_data *data, struct 
ctdb_client_control_state *state);
 int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data 
*data);
@@ -1288,12 +1296,14 @@ int ctdb_client_async_control(struct ctdb_context *ctdb,
                                uint32_t *nodes,
                                struct timeval timeout,
                                bool dont_log_errors,
-                               TDB_DATA data);
+                               TDB_DATA data,
+                               client_async_callback client_callback);
 
 void ctdb_load_nodes_file(struct ctdb_context *ctdb);
 
 int ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode);
 
 int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata);
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA 
*outdata);
 
 #endif
diff --git a/server/ctdb_control.c b/server/ctdb_control.c
index a7f16a8..6c8a4fc 100644
--- a/server/ctdb_control.c
+++ b/server/ctdb_control.c
@@ -390,7 +390,7 @@ static int32_t ctdb_control_dispatch(struct ctdb_context 
*ctdb,
                return ctdb_control_del_public_address(ctdb, indata);
 
        case CTDB_CONTROL_GET_CAPABILITIES:
-               return ctdb->capabilities;
+               return ctdb_control_get_capabilities(ctdb, outdata);
 
        default:
                DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode 
%u\n", opcode));
diff --git a/server/ctdb_recover.c b/server/ctdb_recover.c
index 83e5424..7a96733 100644
--- a/server/ctdb_recover.c
+++ b/server/ctdb_recover.c
@@ -957,3 +957,21 @@ int32_t ctdb_control_try_delete_records(struct 
ctdb_context *ctdb, TDB_DATA inda
 
        return 0;
 }
+
+/*
+  report capabilities
+ */
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA 
*outdata)
+{
+       uint32_t *capabilities = NULL;
+
+       capabilities = talloc(outdata, uint32_t);
+       CTDB_NO_MEMORY(ctdb, capabilities);
+       *capabilities = ctdb->capabilities;
+
+       outdata->dsize = sizeof(uint32_t);
+       outdata->dptr = (uint8_t *)capabilities;
+
+       return 0;       
+}
+
diff --git a/server/ctdb_recoverd.c b/server/ctdb_recoverd.c
index 1a53bb8..c3dff32 100644
--- a/server/ctdb_recoverd.c
+++ b/server/ctdb_recoverd.c
@@ -212,7 +212,7 @@ static int run_recovered_eventscript(struct ctdb_context 
*ctdb, struct ctdb_node
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
                        list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
-                       CONTROL_TIMEOUT(), false, tdb_null) != 0) {
+                       CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' 
event. Recovery failed.\n"));
                talloc_free(tmp_ctx);
                return -1;
@@ -234,7 +234,7 @@ static int run_startrecovery_eventscript(struct 
ctdb_context *ctdb, struct ctdb_
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
                        list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
-                       CONTROL_TIMEOUT(), false, tdb_null) != 0) {
+                       CONTROL_TIMEOUT(), false, tdb_null, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to run the 
'startrecovery' event. Recovery failed.\n"));
                talloc_free(tmp_ctx);
                return -1;
@@ -244,6 +244,40 @@ static int run_startrecovery_eventscript(struct 
ctdb_context *ctdb, struct ctdb_
        return 0;
 }
 
+static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t 
node_pnn, int32_t res, TDB_DATA outdata)
+{
+       if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
+               DEBUG(DEBUG_ERR, (__location__ " Invalid lenght/pointer for 
getcap callback : %d %p\n", outdata.dsize, outdata.dptr));
+               return;
+       }
+       ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+}
+
+/*
+  update the node capabilities for all connected nodes
+ */
+static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map 
*nodemap)
+{
+       uint32_t *nodes;
+       TALLOC_CTX *tmp_ctx;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
+                                       nodes, CONTROL_TIMEOUT(),
+                                       false, tdb_null, async_getcap_callback) 
!= 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to read node 
capabilities.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
 /*
   change recovery mode on all nodes
  */
@@ -262,7 +296,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, 
struct ctdb_node_map *no
        if (rec_mode == CTDB_RECOVERY_ACTIVE) {
                if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
                                                nodes, CONTROL_TIMEOUT(),
-                                               false, tdb_null) != 0) {
+                                               false, tdb_null, NULL) != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Unable to freeze 
nodes. Recovery failed.\n"));
                        talloc_free(tmp_ctx);
                        return -1;
@@ -275,7 +309,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, 
struct ctdb_node_map *no
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
                                        nodes, CONTROL_TIMEOUT(),
-                                       false, data) != 0) {
+                                       false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. 
Recovery failed.\n"));
                talloc_free(tmp_ctx);
                return -1;
@@ -284,7 +318,7 @@ static int set_recovery_mode(struct ctdb_context *ctdb, 
struct ctdb_node_map *no
        if (rec_mode == CTDB_RECOVERY_NORMAL) {
                if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
                                                nodes, CONTROL_TIMEOUT(),
-                                               false, tdb_null) != 0) {
+                                               false, tdb_null, NULL) != 0) {
                        DEBUG(DEBUG_ERR, (__location__ " Unable to thaw nodes. 
Recovery failed.\n"));
                        talloc_free(tmp_ctx);
                        return -1;
@@ -311,7 +345,7 @@ static int set_recovery_master(struct ctdb_context *ctdb, 
struct ctdb_node_map *
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
                        list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
-                       CONTROL_TIMEOUT(), false, data) != 0) {
+                       CONTROL_TIMEOUT(), false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. 
Recovery failed.\n"));
                talloc_free(tmp_ctx);
                return -1;
@@ -1142,7 +1176,7 @@ static int push_recdb_database(struct ctdb_context *ctdb, 
uint32_t dbid,
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
                        list_of_active_nodes(ctdb, nodemap, tmp_ctx, true),
-                       CONTROL_TIMEOUT(), false, outdata) != 0) {
+                       CONTROL_TIMEOUT(), false, outdata, NULL) != 0) {
                DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to 
nodes for db 0x%x\n", dbid));
                talloc_free(recdata);
                talloc_free(tmp_ctx);
@@ -1198,7 +1232,7 @@ static int recover_database(struct ctdb_recoverd *rec,
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
                        list_of_active_nodes(ctdb, nodemap, recdb, true),
-                       CONTROL_TIMEOUT(), false, data) != 0) {
+                       CONTROL_TIMEOUT(), false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. 
Recovery failed.\n"));
                talloc_free(recdb);
                return -1;
@@ -1321,7 +1355,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
 
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
                        list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
-                       CONTROL_TIMEOUT(), false, data) != 0) {
+                       CONTROL_TIMEOUT(), false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. 
Recovery failed.\n"));
                return -1;
        }
@@ -1340,7 +1374,7 @@ static int do_recovery(struct ctdb_recoverd *rec,
        /* commit all the changes */
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
                        list_of_active_nodes(ctdb, nodemap, mem_ctx, true),
-                       CONTROL_TIMEOUT(), false, data) != 0) {
+                       CONTROL_TIMEOUT(), false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery 
changes. Recovery failed.\n"));
                return -1;
        }
@@ -1348,19 +1382,45 @@ static int do_recovery(struct ctdb_recoverd *rec,
        DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
        
 
+       /* update the capabilities for all nodes */
+       ret = update_capabilities(ctdb, nodemap);
+       if (ret!=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update node 
capabilities.\n"));
+               return -1;
+       }
+
        /* build a new vnn map with all the currently active and
           unbanned nodes */
        generation = new_generation();
        vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
        CTDB_NO_MEMORY(ctdb, vnnmap);
        vnnmap->generation = generation;
-       vnnmap->size = rec->num_active;
+       vnnmap->size = 0;
        vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
+       CTDB_NO_MEMORY(ctdb, vnnmap->map);
        for (i=j=0;i<nodemap->num;i++) {
-               if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
-                       vnnmap->map[j++] = nodemap->nodes[i].pnn;
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
                }
+               if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
+                       /* this node can not be an lmaster */
+                       DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, 
skipping it\n", i));
+                       continue;
+               }
+
+               vnnmap->size++;
+               vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, 
vnnmap->size);
+               CTDB_NO_MEMORY(ctdb, vnnmap->map);
+               vnnmap->map[j++] = nodemap->nodes[i].pnn;
+
        }
+       if (vnnmap->size == 0) {
+               DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local 
node (recmaster) anyway.\n"));
+               vnnmap->size++;
+               vnnmap->map = talloc_realloc_size(vnnmap, vnnmap->map, 
vnnmap->size);
+               CTDB_NO_MEMORY(ctdb, vnnmap->map);
+               vnnmap->map[0] = pnn;
+       }       
 
        /* update to the new vnnmap on all nodes */
        ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
@@ -1481,6 +1541,13 @@ static void ctdb_election_data(struct ctdb_recoverd 
*rec, struct election_messag
                        em->num_connected++;
                }
        }
+
+       /* we shouldnt try to win this election if we cant be a recmaster */
+       if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
+               em->num_connected = 0;
+               em->priority_time = timeval_current();
+       }
+
        talloc_free(nodemap);
 }
 
@@ -1494,6 +1561,11 @@ static bool ctdb_election_win(struct ctdb_recoverd *rec, 
struct election_message
 
        ctdb_election_data(rec, &myem);
 
+       /* we cant win if we dont have the recmaster capability */
+       if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
+               return false;
+       }
+
        /* we cant win if we are banned */
        if (rec->node_flags & NODE_FLAGS_BANNED) {
                return false;
@@ -2017,8 +2089,15 @@ ctdb_recoverd_write_pnn_connect_count(struct 
ctdb_recoverd *rec)
        const char count = rec->num_connected;
        struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct 
ctdb_context);
 
+       if (rec->rec_file_fd == -1) {
+               DEBUG(DEBUG_CRIT,(__location__ " Unable to write pnn count. 
pnnfile is not open.\n"));
+               return;
+       } 
+
        if (pwrite(rec->rec_file_fd, &count, 1, ctdb->pnn) == -1) {
                DEBUG(DEBUG_CRIT, (__location__ " Failed to write pnn 
count\n"));
+               close(rec->rec_file_fd);
+               rec->rec_file_fd = -1;
        }
 }
 
@@ -2038,8 +2117,8 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
        DEBUG(DEBUG_INFO, ("Setting PNN lock for pnn:%d\n", ctdb->pnn));
 
        if (rec->rec_file_fd != -1) {
-               DEBUG(DEBUG_CRIT, (__location__ " rec_lock_fd is already open. 
Aborting\n"));
-               exit(10);
+               close(rec->rec_file_fd);
+               rec->rec_file_fd = -1;
        }
 
        pnnfile = talloc_asprintf(rec, "%s.pnn", ctdb->recovery_lock_file);
@@ -2049,7 +2128,8 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
        if (rec->rec_file_fd == -1) {
                DEBUG(DEBUG_CRIT,(__location__ " Unable to open %s - (%s)\n", 
                         pnnfile, strerror(errno)));
-               exit(10);
+               talloc_free(pnnfile);
+               return;
        }
 
        set_close_on_exec(rec->rec_file_fd);
@@ -2063,12 +2143,12 @@ ctdb_recoverd_get_pnn_lock(struct ctdb_recoverd *rec)
                close(rec->rec_file_fd);
                rec->rec_file_fd = -1;
                DEBUG(DEBUG_CRIT,(__location__ " Failed to get pnn lock on 
'%s'\n", pnnfile));
-               exit(10);
+               talloc_free(pnnfile);
+               return;
        }
 
 
        DEBUG(DEBUG_NOTICE,(__location__ " Got pnn lock on '%s'\n", pnnfile));
-
        talloc_free(pnnfile);
 
        /* we start out with 0 connected nodes */
@@ -2086,6 +2166,9 @@ static void ctdb_update_pnn_count(struct event_context 
*ev, struct timed_event *
        struct ctdb_context *ctdb     = rec->ctdb;
        struct ctdb_node_map *nodemap = rec->nodemap;
 
+       /* close and reopen the pnn lock file */
+       ctdb_recoverd_get_pnn_lock(rec);
+
        ctdb_recoverd_write_pnn_connect_count(rec);
 
        event_add_timed(rec->ctdb->ev, rec->ctdb,
@@ -2108,6 +2191,10 @@ static void ctdb_update_pnn_count(struct event_context 
*ev, struct timed_event *
                return;
        }
        if (ctdb->recovery_lock_fd == -1) {
+               DEBUG(DEBUG_ERR, (__location__ " Lost reclock pnn file. 
Yielding recmaster role\n"));
+               close(ctdb->recovery_lock_fd);
+               ctdb->recovery_lock_fd = -1;
+               force_election(rec, ctdb->pnn, rec->nodemap);
                return;
        }
        for (i=0; i<nodemap->num; i++) {
diff --git a/tools/ctdb.c b/tools/ctdb.c
index 3968e0d..6f78b65 100644
--- a/tools/ctdb.c
+++ b/tools/ctdb.c
@@ -443,7 +443,7 @@ struct sockaddr_in *sin)
        /* send release ip to all nodes */
        if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
                        list_of_active_nodes(ctdb, nodemap, ctdb, true),
-                       TIMELIMIT(), false, data) != 0) {
+                       TIMELIMIT(), false, data, NULL) != 0) {
                DEBUG(DEBUG_ERR, (__location__ " Unable to send 'ReleaseIP' to 
all nodes.\n"));
                return -1;
        }


-- 
CTDB repository

[SCM] CTDB repository - branch master updated - 0f1883c69c689b28b0c04148774840b2c4081df6

Reply via email to