During the course of debugging a clustered DB environment, all members
of the southbound database cluster were destroyed (i.e. the .db files
were removed from disk) and then restarted. Once this happened,
ovn-northd and ovn-controller could not interact with the southbound
database because they both detected all members of the cluster as having
"stale" data. The only course of action was to reset ovn-northd and all
ovn-controllers. It is possible to have this happen with the northbound
database as well if it is clustered.
This patch offers new ovn-appctl commands for ovn-northd and
ovn-controller that allows for it to reset its clustered status. This
allows for it to interact with the database successfully after a cluster
teardown and restart.
Signed-off-by: Mark Michelson <[email protected]>
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109
---
controller/ovn-controller.8.xml | 16 ++++++++++++++++
controller/ovn-controller.c | 30 ++++++++++++++++++++++++++---
northd/ovn-northd.8.xml | 28 +++++++++++++++++++++++++++
northd/ovn-northd.c | 34 +++++++++++++++++++++++++++++++++
4 files changed, 105 insertions(+), 3 deletions(-)
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index 76bbbdc5f..fe62163fa 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -491,6 +491,22 @@
recomputes are cpu intensive.
</p>
</dd>
+
+ <dt><code>sb-cluster-state-reset</code></dt>
+ <dd>
+ <p>
+ Reset southbound database cluster status when databases are destroyed
+ and rebuilt.
+ </p>
+ <p>
+ If all databases in a clustered southbound database are removed from
+ disk, then the stored index of all databases will be reset to zero.
+ This will cause ovn-controller to be unable to read or write to the
+ southbound database, because it will always detect the data as stale.
+ In such a case, run this command so that ovn-controller will reset its
+ local index so that it can interact with the southbound database again.
+ </p>
+ </dd>
</dl>
</p>
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 6ff897325..1442accd7 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -73,6 +73,7 @@ static unixctl_cb_func extend_table_list;
static unixctl_cb_func inject_pkt;
static unixctl_cb_func ovn_controller_conn_show;
static unixctl_cb_func engine_recompute_cmd;
+static unixctl_cb_func cluster_state_reset_cmd;
#define DEFAULT_BRIDGE_NAME "br-int"
#define DEFAULT_PROBE_INTERVAL_MSEC 5000
@@ -446,7 +447,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl)
* updates 'sbdb_idl' with that pointer. */
static void
update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl,
- bool *monitor_all_p)
+ bool *monitor_all_p, bool *reset_ovnsb_idl_min_index)
{
const struct ovsrec_open_vswitch *cfg =
ovsrec_open_vswitch_first(ovs_idl);
if (!cfg) {
@@ -476,6 +477,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl
*ovnsb_idl,
if (monitor_all_p) {
*monitor_all_p = monitor_all;
}
+ if (*reset_ovnsb_idl_min_index) {
+ VLOG_INFO("Resetting southbound database cluster state");
+ engine_set_force_recompute(true);
+ ovsdb_idl_reset_min_index(ovnsb_idl);
+ *reset_ovnsb_idl_min_index = false;
+ }
}
static void
@@ -1936,6 +1943,11 @@ main(int argc, char *argv[])
unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd,
NULL);
+ bool reset_ovnsb_idl_min_index = false;
+ unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
+ cluster_state_reset_cmd,
+ &reset_ovnsb_idl_min_index);
+
unsigned int ovs_cond_seqno = UINT_MAX;
unsigned int ovnsb_cond_seqno = UINT_MAX;
@@ -1957,7 +1969,8 @@ main(int argc, char *argv[])
ovs_cond_seqno = new_ovs_cond_seqno;
}
- update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all);
+ update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all,
+ &reset_ovnsb_idl_min_index);
update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl));
@@ -2207,7 +2220,7 @@ main(int argc, char *argv[])
if (!restart) {
bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl);
while (!done) {
- update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL);
+ update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, false);
update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl));
struct ovsdb_idl_txn *ovs_idl_txn
@@ -2441,3 +2454,14 @@ engine_recompute_cmd(struct unixctl_conn *conn
OVS_UNUSED, int argc OVS_UNUSED,
poll_immediate_wake();
unixctl_command_reply(conn, NULL);
}
+
+static void
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
+ const char *argv[] OVS_UNUSED, void *idl_reset_)
+{
+ bool *idl_reset = idl_reset_;
+
+ *idl_reset = true;
+ poll_immediate_wake();
+ unixctl_command_reply(conn, NULL);
+}
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 1f817423a..ddb296aaa 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -96,6 +96,34 @@
acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if
this instance is paused.
</dd>
+
+ <dt><code>sb-cluster-state-reset</code></dt>
+ <dd>
+ <p>
+ Reset southbound database cluster status when databases are destroyed
+ and rebuilt.
+ </p>
+ <p>
+ If all databases in a clustered southbound database are removed from
+ disk, then the stored index of all databases will be reset to zero.
+ This will cause ovn-northd to be unable to read or write to the
+ southbound database, because it will always detect the data as stale.
+ In such a case, run this command so that ovn-northd will reset its
+ local index so that it can interact with the southbound database again.
+ </p>
+ </dd>
+
+ <dt><code>nb-cluster-state-reset</code></dt>
+ <dd>
+ <p>
+ Reset northbound database cluster status when databases are destroyed
+ and rebuilt.
+ </p>
+ <p>
+ This performs the same task as <code>sb-cluster-state-reset</code>
+ except for the northbound database client.
+ </p>
+ </dd>
</dl>
</p>
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 742aad85e..3714488bb 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause;
static unixctl_cb_func ovn_northd_resume;
static unixctl_cb_func ovn_northd_is_paused;
static unixctl_cb_func ovn_northd_status;
+static unixctl_cb_func cluster_state_reset_cmd;
struct northd_context {
struct ovsdb_idl *ovnnb_idl;
@@ -11783,6 +11784,16 @@ main(int argc, char *argv[])
&state);
unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state);
+ bool reset_ovnsb_idl_min_index = false;
+ unixctl_command_register("sb-cluster-state-reset", "", 0, 0,
+ cluster_state_reset_cmd,
+ &reset_ovnsb_idl_min_index);
+
+ bool reset_ovnnb_idl_min_index = false;
+ unixctl_command_register("nb-cluster-state-reset", "", 0, 0,
+ cluster_state_reset_cmd,
+ &reset_ovnnb_idl_min_index);
+
daemonize_complete();
/* We want to detect (almost) all changes to the ovn-nb db. */
@@ -12068,6 +12079,18 @@ main(int argc, char *argv[])
ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl,
northd_probe_interval);
ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl,
northd_probe_interval);
+ if (reset_ovnsb_idl_min_index) {
+ VLOG_INFO("Resetting southbound database cluster state");
+ ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);
+ reset_ovnsb_idl_min_index = false;
+ }
+
+ if (reset_ovnnb_idl_min_index) {
+ VLOG_INFO("Resetting northbound database cluster state");
+ ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl);
+ reset_ovnnb_idl_min_index = false;
+ }
+
poll_block();
if (should_service_stop()) {
exiting = true;
@@ -12146,3 +12169,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc
OVS_UNUSED,
unixctl_command_reply(conn, ds_cstr(&s));
ds_destroy(&s);
}
+
+static void
+cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED,
+ const char *argv[] OVS_UNUSED, void *idl_reset_)
+{
+ bool *idl_reset = idl_reset_;
+
+ *idl_reset = true;
+ poll_immediate_wake();
+ unixctl_command_reply(conn, NULL);
+}