Acked-by: Han Zhou <[email protected]>
On Fri, May 1, 2020 at 12:15 PM Mark Michelson <[email protected]> wrote: > > This patch is dependent on OVS commit > https://patchwork.ozlabs.org/project/openvswitch/patch/[email protected]/ > > Please do not merge this until the OVS commit has been merged as well. > > On 5/1/20 3:13 PM, Mark Michelson wrote: > > During the course of debugging a clustered DB environment, all members > > of the southbound database cluster were destroyed (i.e. the .db files > > were removed from disk) and then restarted. Once this happened, > > ovn-northd and ovn-controller could not interact with the southbound > > database because they both detected all members of the cluster as having > > "stale" data. The only course of action was to reset ovn-northd and all > > ovn-controllers. It is possible to have this happen with the northbound > > database as well if it is clustered. > > > > This patch offers new ovn-appctl commands for ovn-northd and > > ovn-controller that allows for it to reset its clustered status. This > > allows for it to interact with the database successfully after a cluster > > teardown and restart. > > > > Signed-off-by: Mark Michelson <[email protected]> > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1829109 > > --- > > controller/ovn-controller.8.xml | 16 ++++++++++++++++ > > controller/ovn-controller.c | 30 ++++++++++++++++++++++++++--- > > northd/ovn-northd.8.xml | 28 +++++++++++++++++++++++++++ > > northd/ovn-northd.c | 34 +++++++++++++++++++++++++++++++++ > > 4 files changed, 105 insertions(+), 3 deletions(-) > > > > diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml > > index 76bbbdc5f..fe62163fa 100644 > > --- a/controller/ovn-controller.8.xml > > +++ b/controller/ovn-controller.8.xml > > @@ -491,6 +491,22 @@ > > recomputes are cpu intensive. > > </p> > > </dd> > > + > > + <dt><code>sb-cluster-state-reset</code></dt> > > + <dd> > > + <p> > > + Reset southbound database cluster status when databases are destroyed > > + and rebuilt. > > + </p> > > + <p> > > + If all databases in a clustered southbound database are removed from > > + disk, then the stored index of all databases will be reset to zero. > > + This will cause ovn-controller to be unable to read or write to the > > + southbound database, because it will always detect the data as stale. > > + In such a case, run this command so that ovn-controller will reset its > > + local index so that it can interact with the southbound database again. > > + </p> > > + </dd> > > </dl> > > </p> > > > > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > > index 6ff897325..1442accd7 100644 > > --- a/controller/ovn-controller.c > > +++ b/controller/ovn-controller.c > > @@ -73,6 +73,7 @@ static unixctl_cb_func extend_table_list; > > static unixctl_cb_func inject_pkt; > > static unixctl_cb_func ovn_controller_conn_show; > > static unixctl_cb_func engine_recompute_cmd; > > +static unixctl_cb_func cluster_state_reset_cmd; > > > > #define DEFAULT_BRIDGE_NAME "br-int" > > #define DEFAULT_PROBE_INTERVAL_MSEC 5000 > > @@ -446,7 +447,7 @@ get_ofctrl_probe_interval(struct ovsdb_idl *ovs_idl) > > * updates 'sbdb_idl' with that pointer. */ > > static void > > update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl, > > - bool *monitor_all_p) > > + bool *monitor_all_p, bool *reset_ovnsb_idl_min_index) > > { > > const struct ovsrec_open_vswitch *cfg = ovsrec_open_vswitch_first(ovs_idl); > > if (!cfg) { > > @@ -476,6 +477,12 @@ update_sb_db(struct ovsdb_idl *ovs_idl, struct ovsdb_idl *ovnsb_idl, > > if (monitor_all_p) { > > *monitor_all_p = monitor_all; > > } > > + if (*reset_ovnsb_idl_min_index) { > > + VLOG_INFO("Resetting southbound database cluster state"); > > + engine_set_force_recompute(true); > > + ovsdb_idl_reset_min_index(ovnsb_idl); > > + *reset_ovnsb_idl_min_index = false; > > + } > > } > > > > static void > > @@ -1936,6 +1943,11 @@ main(int argc, char *argv[]) > > unixctl_command_register("recompute", "", 0, 0, engine_recompute_cmd, > > NULL); > > > > + bool reset_ovnsb_idl_min_index = false; > > + unixctl_command_register("sb-cluster-state-reset", "", 0, 0, > > + cluster_state_reset_cmd, > > + &reset_ovnsb_idl_min_index); > > + > > unsigned int ovs_cond_seqno = UINT_MAX; > > unsigned int ovnsb_cond_seqno = UINT_MAX; > > > > @@ -1957,7 +1969,8 @@ main(int argc, char *argv[]) > > ovs_cond_seqno = new_ovs_cond_seqno; > > } > > > > - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all); > > + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, &sb_monitor_all, > > + &reset_ovnsb_idl_min_index); > > update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl)); > > ofctrl_set_probe_interval(get_ofctrl_probe_interval(ovs_idl_loop.idl)); > > > > @@ -2207,7 +2220,7 @@ main(int argc, char *argv[]) > > if (!restart) { > > bool done = !ovsdb_idl_has_ever_connected(ovnsb_idl_loop.idl); > > while (!done) { > > - update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL); > > + update_sb_db(ovs_idl_loop.idl, ovnsb_idl_loop.idl, NULL, false); > > update_ssl_config(ovsrec_ssl_table_get(ovs_idl_loop.idl)); > > > > struct ovsdb_idl_txn *ovs_idl_txn > > @@ -2441,3 +2454,14 @@ engine_recompute_cmd(struct unixctl_conn *conn OVS_UNUSED, int argc OVS_UNUSED, > > poll_immediate_wake(); > > unixctl_command_reply(conn, NULL); > > } > > + > > +static void > > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED, > > + const char *argv[] OVS_UNUSED, void *idl_reset_) > > +{ > > + bool *idl_reset = idl_reset_; > > + > > + *idl_reset = true; > > + poll_immediate_wake(); > > + unixctl_command_reply(conn, NULL); > > +} > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml > > index 1f817423a..ddb296aaa 100644 > > --- a/northd/ovn-northd.8.xml > > +++ b/northd/ovn-northd.8.xml > > @@ -96,6 +96,34 @@ > > acquired OVSDB lock on SB DB, "standby" if it has not or "paused" if > > this instance is paused. > > </dd> > > + > > + <dt><code>sb-cluster-state-reset</code></dt> > > + <dd> > > + <p> > > + Reset southbound database cluster status when databases are destroyed > > + and rebuilt. > > + </p> > > + <p> > > + If all databases in a clustered southbound database are removed from > > + disk, then the stored index of all databases will be reset to zero. > > + This will cause ovn-northd to be unable to read or write to the > > + southbound database, because it will always detect the data as stale. > > + In such a case, run this command so that ovn-northd will reset its > > + local index so that it can interact with the southbound database again. > > + </p> > > + </dd> > > + > > + <dt><code>nb-cluster-state-reset</code></dt> > > + <dd> > > + <p> > > + Reset northbound database cluster status when databases are destroyed > > + and rebuilt. > > + </p> > > + <p> > > + This performs the same task as <code>sb-cluster-state-reset</code> > > + except for the northbound database client. > > + </p> > > + </dd> > > </dl> > > </p> > > > > diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c > > index 742aad85e..3714488bb 100644 > > --- a/northd/ovn-northd.c > > +++ b/northd/ovn-northd.c > > @@ -56,6 +56,7 @@ static unixctl_cb_func ovn_northd_pause; > > static unixctl_cb_func ovn_northd_resume; > > static unixctl_cb_func ovn_northd_is_paused; > > static unixctl_cb_func ovn_northd_status; > > +static unixctl_cb_func cluster_state_reset_cmd; > > > > struct northd_context { > > struct ovsdb_idl *ovnnb_idl; > > @@ -11783,6 +11784,16 @@ main(int argc, char *argv[]) > > &state); > > unixctl_command_register("status", "", 0, 0, ovn_northd_status, &state); > > > > + bool reset_ovnsb_idl_min_index = false; > > + unixctl_command_register("sb-cluster-state-reset", "", 0, 0, > > + cluster_state_reset_cmd, > > + &reset_ovnsb_idl_min_index); > > + > > + bool reset_ovnnb_idl_min_index = false; > > + unixctl_command_register("nb-cluster-state-reset", "", 0, 0, > > + cluster_state_reset_cmd, > > + &reset_ovnnb_idl_min_index); > > + > > daemonize_complete(); > > > > /* We want to detect (almost) all changes to the ovn-nb db. */ > > @@ -12068,6 +12079,18 @@ main(int argc, char *argv[]) > > ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl, northd_probe_interval); > > ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl, northd_probe_interval); > > > > + if (reset_ovnsb_idl_min_index) { > > + VLOG_INFO("Resetting southbound database cluster state"); > > + ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl); > > + reset_ovnsb_idl_min_index = false; > > + } > > + > > + if (reset_ovnnb_idl_min_index) { > > + VLOG_INFO("Resetting northbound database cluster state"); > > + ovsdb_idl_reset_min_index(ovnnb_idl_loop.idl); > > + reset_ovnnb_idl_min_index = false; > > + } > > + > > poll_block(); > > if (should_service_stop()) { > > exiting = true; > > @@ -12146,3 +12169,14 @@ ovn_northd_status(struct unixctl_conn *conn, int argc OVS_UNUSED, > > unixctl_command_reply(conn, ds_cstr(&s)); > > ds_destroy(&s); > > } > > + > > +static void > > +cluster_state_reset_cmd(struct unixctl_conn *conn, int argc OVS_UNUSED, > > + const char *argv[] OVS_UNUSED, void *idl_reset_) > > +{ > > + bool *idl_reset = idl_reset_; > > + > > + *idl_reset = true; > > + poll_immediate_wake(); > > + unixctl_command_reply(conn, NULL); > > +} > > > > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
