When server A sends the leader a request to remove server B from the cluster, where A != B, the leader sends both A and B a notification when the removal is complete. Until now, however, the notification (which is a raft_remove_server_reply message) did not say which server had been removed, and the receiver did not check. Instead, the receiver assumed that it had been removed. The result was that B was removed and A stopped serving out the database even though it was still part of the cluster, This commit fixes the problem.
Reported-by: ramteja tadishetti <[email protected]> Signed-off-by: Ben Pfaff <[email protected]> --- ovsdb/raft-rpc.c | 5 +++++ ovsdb/raft-rpc.h | 7 +++++++ ovsdb/raft.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c index 13aee0c4bac5..56c07d4879ba 100644 --- a/ovsdb/raft-rpc.c +++ b/ovsdb/raft-rpc.c @@ -460,6 +460,10 @@ static void raft_remove_server_reply_to_jsonrpc(const struct raft_remove_server_reply *rpy, struct json *args) { + if (!uuid_is_zero(&rpy->target_sid)) { + json_object_put_format(args, "target_server", + UUID_FMT, UUID_ARGS(&rpy->target_sid)); + } json_object_put(args, "success", json_boolean_create(rpy->success)); } @@ -468,6 +472,7 @@ raft_remove_server_reply_from_jsonrpc(struct ovsdb_parser *p, struct raft_remove_server_reply *rpy) { rpy->success = raft_parse_required_boolean(p, "success"); + raft_parse_optional_uuid(p, "target_server", &rpy->target_sid); } static void diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h index 15ddf012838c..bdc3429cc67c 100644 --- a/ovsdb/raft-rpc.h +++ b/ovsdb/raft-rpc.h @@ -205,6 +205,13 @@ struct raft_add_server_reply { struct raft_remove_server_reply { struct raft_rpc_common common; bool success; + + /* SID of the removed server, but all-zeros if it is the same as the + * destination of the RPC. (Older ovsdb-server did not have 'target_sid' + * and assumed that the destination was always the target, so by omitting + * 'target_sid' when this is the case we can preserve a small amount of + * inter-version compatibility.) */ + struct uuid target_sid; }; struct raft_install_snapshot_request { diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 07884820ed9b..753881586334 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -296,6 +296,7 @@ static void raft_send_remove_server_reply__( struct raft *, const struct uuid *target_sid, const struct uuid *requester_sid, struct unixctl_conn *requester_conn, bool success, const char *comment); +static void raft_finished_leaving_cluster(struct raft *); static void raft_server_init_leader(struct raft *, struct raft_server *); @@ -303,6 +304,7 @@ static bool raft_rpc_is_heartbeat(const union raft_rpc *); static bool raft_is_rpc_synced(const struct raft *, const union raft_rpc *); static void raft_handle_rpc(struct raft *, const union raft_rpc *); + static bool raft_send_at(struct raft *, const union raft_rpc *, int line_number); #define raft_send(raft, rpc) raft_send_at(raft, rpc, __LINE__) @@ -2197,16 +2199,28 @@ raft_send_add_server_reply__(struct raft *raft, const struct uuid *sid, } static void -raft_send_remove_server_reply_rpc(struct raft *raft, const struct uuid *sid, +raft_send_remove_server_reply_rpc(struct raft *raft, + const struct uuid *dst_sid, + const struct uuid *target_sid, bool success, const char *comment) { + if (uuid_equals(&raft->sid, dst_sid)) { + if (success && uuid_equals(&raft->sid, target_sid)) { + raft_finished_leaving_cluster(raft); + } + return; + } + const union raft_rpc rpy = { .remove_server_reply = { .common = { .type = RAFT_RPC_REMOVE_SERVER_REPLY, - .sid = *sid, + .sid = *dst_sid, .comment = CONST_CAST(char *, comment), }, + .target_sid = (uuid_equals(dst_sid, target_sid) + ? UUID_ZERO + : *target_sid), .success = success, } }; @@ -2235,6 +2249,9 @@ raft_send_remove_server_reply__(struct raft *raft, } else { char buf[SID_LEN + 1]; ds_put_cstr(&s, raft_get_nickname(raft, target_sid, buf, sizeof buf)); + if (uuid_equals(target_sid, &raft->sid)) { + ds_put_cstr(&s, " (ourselves)"); + } } ds_put_format(&s, " from cluster "CID_FMT" %s", CID_ARGS(&raft->cid), @@ -2251,11 +2268,12 @@ raft_send_remove_server_reply__(struct raft *raft, * allows it to be sure that it's really removed and update its log and * disconnect permanently. */ if (!uuid_is_zero(requester_sid)) { - raft_send_remove_server_reply_rpc(raft, requester_sid, + raft_send_remove_server_reply_rpc(raft, requester_sid, target_sid, success, comment); } if (!uuid_equals(requester_sid, target_sid)) { - raft_send_remove_server_reply_rpc(raft, target_sid, success, comment); + raft_send_remove_server_reply_rpc(raft, target_sid, target_sid, + success, comment); } if (requester_conn) { if (success) { @@ -3559,17 +3577,25 @@ raft_handle_remove_server_request(struct raft *raft, } static void -raft_handle_remove_server_reply(struct raft *raft, - const struct raft_remove_server_reply *rpc) +raft_finished_leaving_cluster(struct raft *raft) { - if (rpc->success) { - VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT, - SID_ARGS(&raft->sid), CID_ARGS(&raft->cid)); + VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT, + SID_ARGS(&raft->sid), CID_ARGS(&raft->cid)); - raft_record_note(raft, "left", "this server left the cluster"); + raft_record_note(raft, "left", "this server left the cluster"); + + raft->leaving = false; + raft->left = true; +} - raft->leaving = false; - raft->left = true; +static void +raft_handle_remove_server_reply(struct raft *raft, + const struct raft_remove_server_reply *rpc) +{ + if (rpc->success + && (uuid_is_zero(&rpc->target_sid) + || uuid_equals(&rpc->target_sid, &raft->sid))) { + raft_finished_leaving_cluster(raft); } } -- 2.16.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
